[gtk-doc] Always open files in text mode and always use utf-8

From: Stefan Sauer <stefkost src gnome org>
To: commits-list gnome org
Cc:
Subject: [gtk-doc] Always open files in text mode and always use utf-8
Date: Thu, 26 Oct 2017 18:41:57 +0000 (UTC)
commit 1eeec38a9a06a9956cdab9789cbd2ea105866198
Author: Christoph Reiter <reiter christoph gmail com>
Date:   Mon Sep 18 22:49:09 2017 +0200

    Always open files in text mode and always use utf-8
    
    Introduces a common.open_text() helper with saner defaults for opening
    text files across Python versions.
    
    open() defaults to the locale encoding, which is utf-8 on a properly
    configured Unix but cp-1252 on Windows, which can't handle all of Unicode.
    Instead of relying on the default, always use utf-8 for text files.
    
    To reduce the difference of types processed by Python 2 vs 3 use
    codecs.open() to open text files in text mode on Python 2. The
    resulting file object will return unicode like on Python 3, but still
    allows passing in ASCII only str.
    
    Also fixes a few missing file.close() operations, which is important
    because files that are still open can't be renamed/deleted on Windows.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=787862

 gtkdoc/check.py    |    8 ++++--
 gtkdoc/common.py   |   25 ++++++++++++++++++++++++
 gtkdoc/fixxref.py  |   15 +++++--------
 gtkdoc/mkdb.py     |   53 ++++++++++++++++++++++++++-------------------------
 gtkdoc/mkhtml.py   |    3 +-
 gtkdoc/mkpdf.py    |    6 +++-
 gtkdoc/rebase.py   |    9 ++++---
 gtkdoc/scan.py     |    8 +++---
 gtkdoc/scangobj.py |    4 +-
 9 files changed, 80 insertions(+), 51 deletions(-)
---
diff --git a/gtkdoc/check.py b/gtkdoc/check.py
index 0a58b0c..7f32b60 100755
--- a/gtkdoc/check.py
+++ b/gtkdoc/check.py
@@ -32,6 +32,8 @@ import os
 import re
 from glob import glob
 
+from . import common
+
 
 class FileFormatError(Exception):
     pass
@@ -46,14 +48,14 @@ def grep(regexp, lines, what):
 
 
 def check_empty(filename):
-    with open(filename) as f:
+    with open(filename, "rb") as f:
         count = sum(1 for line in f if line.strip())
     return count
 
 
 def check_includes(filename):
     # Check that each XML file in the xml directory is included in doc_main_file
-    with open(filename) as f:
+    with common.open_text(filename) as f:
         lines = f.read().splitlines()
         num_missing = 0
         for include in glob('xml/*.xml'):
@@ -73,7 +75,7 @@ def get_variable(env, lines, variable):
 
 
 def read_file(filename):
-    with open(filename) as f:
+    with common.open_text(filename) as f:
         return f.read().splitlines()
 
 
diff --git a/gtkdoc/common.py b/gtkdoc/common.py
index 50c9f5f..9dfebef 100644
--- a/gtkdoc/common.py
+++ b/gtkdoc/common.py
@@ -28,10 +28,35 @@ import os
 import re
 import subprocess
 import sys
+import six
+import codecs
 
 from . import config
 
 
+def open_text(filename, mode="r", encoding="utf-8"):
+    """An open() which removes some differences between Python 2 and 3 and
+    has saner defaults.
+
+    Unlike the builtin open, utf-8 is used by default and not the locale
+    encoding (which is ANSI on Windows for example, not very helpful).
+
+    For Python 2, files are opened in text mode like with Python 3.
+    """
+
+    if mode not in ("r", "w"):
+        raise ValueError("mode %r not supported, must be 'r' or 'w'" % mode)
+
+    if six.PY3:
+        return open(filename, mode, encoding=encoding)
+    else:
+        # We can't use io.open() here as its write method is too strict and
+        # only allows unicode instances and not everything in the codebase
+        # forces unicode at the moment. codecs.open() on the other hand
+        # happily takes ASCII str and decodes it.
+        return codecs.open(filename, mode, encoding=encoding)
+
+
 def setup_logging():
     """Check GTKDOC_TRACE environment variable.
 
diff --git a/gtkdoc/fixxref.py b/gtkdoc/fixxref.py
index 3027c10..164b008 100755
--- a/gtkdoc/fixxref.py
+++ b/gtkdoc/fixxref.py
@@ -175,7 +175,7 @@ def ReadDevhelp(file, use_absolute_links):
 
     logging.info('Scanning index file=%s, absolute=%d, dir=%s', file, use_absolute_links, dir)
 
-    for line in open(file):
+    for line in common.open_text(file):
         m = re.search(r' link="([^#]*)#([^"]*)"', line)
         if m:
             link = m.group(1) + '#' + m.group(2)
@@ -184,7 +184,7 @@ def ReadDevhelp(file, use_absolute_links):
 
 
 def ReadSections(options):
-    for line in open(options.module + '-sections.txt'):
+    for line in common.open_text(options.module + '-sections.txt'):
         m1 = re.search(r'^<SUBSECTION\s*(.*)>', line)
         if line.startswith('#') or line.strip() == '':
             continue
@@ -220,9 +220,7 @@ def FixCrossReferences(options):
 def FixHTMLFile(options, file):
     logging.info('Fixing file: %s', file)
 
-    content = open(file).read()
-    if sys.version_info < (3,):
-        content = content.decode('utf-8')
+    content = common.open_text(file).read()
 
     if config.highlight:
         # FIXME: ideally we'd pass a clue about the example language to the highligher
@@ -266,9 +264,8 @@ def FixHTMLFile(options, file):
 
     new_file = file + '.new'
     content = '\n'.join(lines)
-    if sys.version_info < (3,):
-        content = content.encode('utf-8')
-    open(new_file, 'w').write(content)
+    with common.open_text(new_file, 'w') as h:
+        h.write(content)
 
     os.unlink(file)
     os.rename(new_file, file)
@@ -381,7 +378,7 @@ def HighlightSourceVim(options, type, source):
         script += "%s -n -e -u NONE -T xterm >/dev/null" % config.highlight
         subprocess.check_call([script], shell=True)
 
-        highlighted_source = open(temp_source_file + ".html").read()
+        highlighted_source = common.open_text(temp_source_file + ".html").read()
         highlighted_source = re.sub(r'.*<pre\b[^>]*>\n', '', highlighted_source, flags=re.MULTILINE)
         highlighted_source = re.sub(r'</pre>.*', '', highlighted_source, flags=re.MULTILINE)
 
diff --git a/gtkdoc/mkdb.py b/gtkdoc/mkdb.py
index e103138..6c1538f 100644
--- a/gtkdoc/mkdb.py
+++ b/gtkdoc/mkdb.py
@@ -302,7 +302,8 @@ def Run(options):
         OutputSinceIndexes()
         OutputAnnotationGlossary()
 
-        open(os.path.join(ROOT_DIR, 'sgml.stamp'), 'w').write('timestamp')
+        with open(os.path.join(ROOT_DIR, 'sgml.stamp'), 'w') as h:
+            h.write('timestamp')
 
 
 def OutputObjectList():
@@ -315,7 +316,7 @@ def OutputObjectList():
     old_object_index = os.path.join(DB_OUTPUT_DIR, "object_index.sgml")
     new_object_index = os.path.join(DB_OUTPUT_DIR, "object_index.new")
 
-    OUTPUT = open(new_object_index, 'w')
+    OUTPUT = common.open_text(new_object_index, 'w')
 
     OUTPUT.write('''%s
 <informaltable pgwide="1" frame="none">
@@ -377,7 +378,7 @@ def OutputDB(file, options):
     """
 
     logging.info("Reading: %s", file)
-    INPUT = open(file)
+    INPUT = common.open_text(file)
     filename = ''
     book_top = ''
     book_bottom = ''
@@ -1001,7 +1002,7 @@ def OutputAnnotationGlossary():
                 rerun = True
                 break
 
-    OUTPUT = open(new_glossary, 'w')
+    OUTPUT = common.open_text(new_glossary, 'w')
 
     OUTPUT.write('''%s
 <glossary id="annotation-glossary">
@@ -1049,7 +1050,7 @@ def ReadKnownSymbols(file):
     subsection = ''
 
     logging.info("Reading: %s", file)
-    INPUT = open(file)
+    INPUT = common.open_text(file)
     for line in INPUT:
         if line.startswith('#'):
             continue
@@ -2157,7 +2158,7 @@ def OutputDBFile(file, title, section_id, includes, functions_synop, other_synop
     old_db_file = os.path.join(DB_OUTPUT_DIR, file + '.xml')
     new_db_file = os.path.join(DB_OUTPUT_DIR, file + '.xml.new')
 
-    OUTPUT = open(new_db_file, 'w')
+    OUTPUT = common.open_text(new_db_file, 'w')
 
     object_anchors = ''
     for fobject in file_objects:
@@ -2324,7 +2325,7 @@ def OutputProgramDBFile(program, section_id):
     old_db_file = os.path.join(DB_OUTPUT_DIR, program + ".xml")
     new_db_file = os.path.join(DB_OUTPUT_DIR, program + ".xml.new")
 
-    OUTPUT = open(new_db_file, 'w')
+    OUTPUT = common.open_text(new_db_file, 'w')
 
     OUTPUT.write('''%s
 <refentry id="%s">
@@ -2367,9 +2368,9 @@ def OutputExtraFile(file):
     old_db_file = os.path.join(DB_OUTPUT_DIR, basename)
     new_db_file = os.path.join(DB_OUTPUT_DIR, basename + ".new")
 
-    contents = open(file).read()
+    contents = common.open_text(file).read()
 
-    OUTPUT = open(new_db_file, 'w')
+    OUTPUT = common.open_text(new_db_file, 'w')
     OUTPUT.write(ExpandAbbreviations(basename + " file", contents))
     OUTPUT.close()
 
@@ -2378,7 +2379,7 @@ def OutputExtraFile(file):
 
 def GetDocbookHeader(main_file):
     if os.path.exists(main_file):
-        INPUT = open(main_file)
+        INPUT = common.open_text(main_file)
         header = ''
         for line in INPUT:
             if re.search(r'^\s*<(book|chapter|article)', line):
@@ -2421,7 +2422,7 @@ def OutputBook(main_file, book_top, book_bottom):
     old_file = os.path.join(DB_OUTPUT_DIR, MODULE + "-doc.top")
     new_file = os.path.join(DB_OUTPUT_DIR, MODULE + "-doc.top.new")
 
-    OUTPUT = open(new_file, 'w')
+    OUTPUT = common.open_text(new_file, 'w')
     OUTPUT.write(book_top)
     OUTPUT.close()
 
@@ -2430,7 +2431,7 @@ def OutputBook(main_file, book_top, book_bottom):
     old_file = os.path.join(DB_OUTPUT_DIR, MODULE + "-doc.bottom")
     new_file = os.path.join(DB_OUTPUT_DIR, MODULE + "-doc.bottom.new")
 
-    OUTPUT = open(new_file, 'w')
+    OUTPUT = common.open_text(new_file, 'w')
     OUTPUT.write(book_bottom)
     OUTPUT.close()
 
@@ -2439,7 +2440,7 @@ def OutputBook(main_file, book_top, book_bottom):
     # If the main docbook file hasn't been created yet, we create it here.
     # The user can tweak it later.
     if main_file and not os.path.exists(main_file):
-        OUTPUT = open(main_file, 'w')
+        OUTPUT = common.open_text(main_file, 'w')
 
         logging.info("no master doc, create default one at: " + main_file)
 
@@ -3665,7 +3666,7 @@ def ScanSourceFile(ifile, ignore_files):
 
     logging.info("Scanning source file: %s", ifile)
 
-    SRCFILE = open(ifile)
+    SRCFILE = common.open_text(ifile)
     in_comment_block = False
     symbol = None
     in_part = ''
@@ -3983,7 +3984,7 @@ def OutputMissingDocumentation():
     buffer_deprecated = ''
     buffer_descriptions = ''
 
-    UNDOCUMENTED = open(new_undocumented_file, 'w')
+    UNDOCUMENTED = common.open_text(new_undocumented_file, 'w')
 
     for symbol in sorted(iterkeys(AllSymbols)):
         # FIXME: should we print common.LogWarnings for undocumented stuff?
@@ -4064,7 +4065,7 @@ def OutputUndeclaredSymbols():
     old_undeclared_file = os.path.join(ROOT_DIR, MODULE + "-undeclared.txt")
     new_undeclared_file = os.path.join(ROOT_DIR, MODULE + "-undeclared.new")
 
-    UNDECLARED = open(new_undeclared_file, 'w')
+    UNDECLARED = common.open_text(new_undeclared_file, 'w')
 
     if UndeclaredSymbols:
         UNDECLARED.write("\n".join(sorted(iterkeys(UndeclaredSymbols))))
@@ -4089,7 +4090,7 @@ def OutputUnusedSymbols():
     old_unused_file = os.path.join(ROOT_DIR, MODULE + "-unused.txt")
     new_unused_file = os.path.join(ROOT_DIR, MODULE + "-unused.new")
 
-    UNUSED = open(new_unused_file, 'w')
+    UNUSED = common.open_text(new_unused_file, 'w')
 
     for symbol in sorted(iterkeys(Declarations)):
         if not symbol in DeclarationOutput:
@@ -4110,7 +4111,7 @@ def OutputUnusedSymbols():
 
 def OutputAllSymbols():
     """Outputs list of all symbols to a file."""
-    SYMBOLS = open(os.path.join(ROOT_DIR, MODULE + "-symbols.txt"), 'w')
+    SYMBOLS = common.open_text(os.path.join(ROOT_DIR, MODULE + "-symbols.txt"), 'w')
 
     for symbol in sorted(iterkeys(AllSymbols)):
         SYMBOLS.write(symbol + "\n")
@@ -4119,7 +4120,7 @@ def OutputAllSymbols():
 
 def OutputSymbolsWithoutSince():
     """Outputs list of all symbols without a since tag to a file."""
-    SYMBOLS = open(os.path.join(ROOT_DIR, MODULE + "-nosince.txt"), 'w')
+    SYMBOLS = common.open_text(os.path.join(ROOT_DIR, MODULE + "-nosince.txt"), 'w')
 
     for symbol in sorted(iterkeys(SourceSymbolDocs)):
         if symbol in Since:
@@ -4257,7 +4258,7 @@ def ReadDeclarationsFile(ifile, override):
         DeclarationConditional.clear()
         DeclarationOutput.clear()
 
-    INPUT = open(ifile)
+    INPUT = common.open_text(ifile)
     declaration_type = ''
     declaration_name = None
     declaration = None
@@ -4371,7 +4372,7 @@ def ReadSignalsFile(ifile):
     if not os.path.isfile(ifile):
         return
 
-    INPUT = open(ifile)
+    INPUT = common.open_text(ifile)
     line_number = 0
     for line in INPUT:
         line_number += 1
@@ -4437,7 +4438,7 @@ def ReadObjectHierarchy(ifile):
         logging.debug('no *-hierarchy.tx')
         return
 
-    INPUT = open(ifile)
+    INPUT = common.open_text(ifile)
 
     # Only emit objects if they are supposed to be documented, or if
     # they have documented children. To implement this, we maintain a
@@ -4488,7 +4489,7 @@ def ReadObjectHierarchy(ifile):
 
     logging.debug('got %d entries for hierarchy', len(tree))
 
-    OUTPUT = open(new_tree_index, 'w')
+    OUTPUT = common.open_text(new_tree_index, 'w')
     OUTPUT.write(MakeDocHeader("screen") + "\n<screen>\n" + AddTreeLineArt(tree) + "\n</screen>\n")
     OUTPUT.close()
 
@@ -4509,7 +4510,7 @@ def ReadInterfaces(ifile):
     if not os.path.isfile(ifile):
         return
 
-    INPUT = open(ifile)
+    INPUT = common.open_text(ifile)
 
     for line in INPUT:
         line = line.strip()
@@ -4542,7 +4543,7 @@ def ReadPrerequisites(ifile):
     if not os.path.isfile(ifile):
         return
 
-    INPUT = open(ifile)
+    INPUT = common.open_text(ifile)
 
     for line in INPUT:
         line = line.strip()
@@ -4593,7 +4594,7 @@ def ReadArgsFile(ifile):
     if not os.path.isfile(ifile):
         return
 
-    INPUT = open(ifile)
+    INPUT = common.open_text(ifile)
     line_number = 0
     for line in INPUT:
         line_number += 1
diff --git a/gtkdoc/mkhtml.py b/gtkdoc/mkhtml.py
index 80ddadf..3d03e0a 100644
--- a/gtkdoc/mkhtml.py
+++ b/gtkdoc/mkhtml.py
@@ -90,5 +90,6 @@ def run(options):
     for f in glob(styledir + '/*.png') + glob(styledir + '/*.css'):
         shutil.copy(f, '.')
 
-    open('../html.stamp', 'w').write('timestamp')
+    with open('../html.stamp', 'w') as h:
+        h.write('timestamp')
     return res
diff --git a/gtkdoc/mkpdf.py b/gtkdoc/mkpdf.py
index 5fd9ecb..485b487 100755
--- a/gtkdoc/mkpdf.py
+++ b/gtkdoc/mkpdf.py
@@ -37,7 +37,8 @@ def run_xsltproc(options, args):
         command += ['--path', path]
     pc = subprocess.Popen(command + args, stderr=subprocess.PIPE)
     (o, stde) = pc.communicate()
-    open('profile.txt', 'wb').write(stde)
+    with open('profile.txt', 'wb') as h:
+        h.write(stde)
     return pc.returncode
 
 
@@ -118,5 +119,6 @@ def run(options):
         print("dblatex or fop must be installed to use gtkdoc-mkpdf.")
         res = 1
 
-    open('pdf.stamp', 'w').write('timestamp')
+    with open('pdf.stamp', 'w') as h:
+        h.write('timestamp')
     return res
diff --git a/gtkdoc/rebase.py b/gtkdoc/rebase.py
index e518586..b329c51 100755
--- a/gtkdoc/rebase.py
+++ b/gtkdoc/rebase.py
@@ -136,7 +136,7 @@ gunzip %s/%s
 def ReadDevhelp(dir, file):
     onlinedir = None
 
-    for line in open(os.path.join(dir, file)):
+    for line in common.open_text(os.path.join(dir, file)):
         # online must come before chapter/functions
         if '<chapters' in line or '<functions' in line:
             break
@@ -150,7 +150,7 @@ def ReadDevhelp(dir, file):
 def ReadIndex(dir, file):
     onlinedir = None
 
-    for line in open(os.path.join(dir, file)):
+    for line in common.open_text(os.path.join(dir, file)):
         # ONLINE must come before any ANCHORs
         if '<ANCHOR' in line:
             break
@@ -207,10 +207,11 @@ def RebaseFile(filename, options):
     def repl_func(match):
         return match.group(1) + RebaseLink(match.group(2), options) + match.group(3)
 
-    contents = open(filename).read()
+    contents = common.open_text(filename).read()
     processed = re.sub(regex, repl_func, contents)
     newfilename = filename + '.new'
-    open(newfilename, 'w').write(processed)
+    with common.open_text(newfilename, 'w') as h:
+        h.write(processed)
     os.unlink(filename)
     os.rename(newfilename, filename)
 
diff --git a/gtkdoc/scan.py b/gtkdoc/scan.py
index 0843519..a106e48 100644
--- a/gtkdoc/scan.py
+++ b/gtkdoc/scan.py
@@ -77,16 +77,16 @@ def Run(options):
     for dir in options.source_dir:
         ScanHeaders(dir, section_list, decl_list, get_types, options)
 
-    with open(new_decl_list, 'w') as f:
+    with common.open_text(new_decl_list, 'w') as f:
         for section in sorted(iterkeys(section_list)):
             f.write(section_list[section])
 
-    with open(new_decl, 'w') as f:
+    with common.open_text(new_decl, 'w') as f:
         for decl in decl_list:
             f.write(decl)
 
     if options.rebuild_types:
-        with open(new_types, 'w') as f:
+        with common.open_text(new_types, 'w') as f:
             for func in sorted(get_types):
                 f.write(func + '\n')
 
@@ -226,7 +226,7 @@ def ScanHeader(input_file, section_list, decl_list, get_types, options):
 
     logging.info('Scanning %s', input_file)
 
-    for line in open(input_file):
+    for line in common.open_text(input_file):
         # If this is a private header, skip it.
         if re.search(r'%^\s*/\*\s*<\s*private_header\s*>\s*\*/', line):
             return
diff --git a/gtkdoc/scangobj.py b/gtkdoc/scangobj.py
index e6db6cd..116270a 100644
--- a/gtkdoc/scangobj.py
+++ b/gtkdoc/scangobj.py
@@ -1185,7 +1185,7 @@ MAIN_CODE_END = """
 def run(options):
 
     c_file = options.module + '-scan.c'
-    output = open(c_file, 'w')
+    output = common.open_text(c_file, 'w')
 
     base_filename = os.path.join(options.output_dir, options.module)
     old_signals_filename = base_filename + '.signals'
@@ -1206,7 +1206,7 @@ def run(options):
     get_types = ""
     ntypes = 1
 
-    for line in open(options.types):
+    for line in common.open_text(options.types):
         if line.startswith('#include'):
             includes += line
         elif line.startswith('%') or line.strip() == '':
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]