[gimp-help] tools: add new tool validate_po.py to test for errors in XML tags etc in po translations

From: Jacob Boerema <jboerema src gnome org>
To: commits-list gnome org
Cc:
Subject: [gimp-help] tools: add new tool validate_po.py to test for errors in XML tags etc in po translations
Date: Wed, 16 Feb 2022 20:05:44 +0000 (UTC)
commit a32a0a1ae9a29e3bd563b1c289a4645058d61c56
Author: Jacob Boerema <jgboerema gmail com>
Date:   Wed Feb 16 15:03:50 2022 -0500

    tools: add new tool validate_po.py to test for errors in XML tags etc in po translations
    
    Note: this requires the Python library polib; install it with: pip install polib
    
    I use it inside a script to check all po files of a language, like this:
    
    cd ~/gimp-help/po/$1
    find . -type f -print0 | xargs -0 ../../tools/validate_po.py $2 $3
    
    Here $1 is the language to test, and $2 and $3 are optional parameters
    (--warnings and --verbose).
    
    Any single file can also be tested in the same way, without using a script.

 tools/validate_po.py | 343 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 343 insertions(+)
---
diff --git a/tools/validate_po.py b/tools/validate_po.py
new file mode 100644
index 000000000..b70c0a926
--- /dev/null
+++ b/tools/validate_po.py
@@ -0,0 +1,343 @@
+#!/usr/bin/env python3
+#
+# validate_po.py - Validates correct use of XML tags and other possible
+#                  causes of problems in po files.
+# Copyright (c) 2021, 2022 Jacob Boerema.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+# Requirements:
+# - polib (pip install polib)
+
+import sys
+import getopt
+import polib
+
+VERSION = 0.2
+
+class Validate(object):
+    def __init__(self, verbose, warnings):
+        self.pofile = None
+        self.po = None
+        self.errors = 0
+        self.filecnt = 0
+        self.files_with_errors = 0
+        self.log=sys.stdout
+        self.verbose = verbose
+        self.warnings = warnings
+
+    def setFile(self, pofile):
+        self.pofile = pofile
+        self.po = polib.pofile(pofile)
+        self.filecnt += 1
+
+    def printErrorHeader(self, entry, log):
+        if self.headerPrinted:
+            return
+        self.headerPrinted = True
+        if not self.verbose:
+            print(f"\nPo file: {self.pofile}")
+            print(f"Po line number: {entry.linenum}")
+        else:
+            print(f"\nPo line number: {entry.linenum}")
+        print(f"Source text: {entry.occurrences}")
+        #print(f"Comment: {entry.comment}")
+        #print(f"Translator Comment: {entry.tcomment}")
+        #print(f"Context: {entry.msgctxt}")
+        if self.verbose:
+                print(f"\nOriginal msgid:\n{entry.msgid}", file=self.log)
+                print(f"\nTranslated msgstr:\n{entry.msgstr}\n", file=self.log)
+
+    def parse_text(self, entry):
+        stack = []
+        textblock = entry.msgstr
+        err = 0
+        #self.headerPrinted = False
+
+        # Note that XML tags do not get escaped. This means we can't really detect
+        # the difference between the use of < and > signs and tag begin/end.
+        # For now let's just assume that all occurrences of < and > are for tags.
+
+        start_tag = textblock.find('<')
+        while start_tag > -1:
+            textblock = textblock[start_tag+1:]
+            end_tag = textblock.find('>')
+            if end_tag > 2 and textblock[0] == '!' and textblock[1] == '-' and textblock[2] == '-':
+                # Start of comment inside a translation is suspicious!
+                if self.warnings:
+                    self.printErrorHeader(entry, self.log)
+                    print(f"WARNING: Suspicious XML comment found. Skipping contents of comment.", file=self.log)
+                end_comment = textblock.find('-->')
+                if end_comment > -1:
+                    textblock = textblock[end_comment+3:]
+                    start_tag = textblock.find('<')
+                    continue
+                else:
+                    if self.warnings:
+                        print(f"WARNING: XML comment closing tag missing", file=self.log)
+                    start_tag = -1
+                    continue
+
+            if end_tag > -1:
+                # Found left and right brackets: grab tag
+                tag = textblock[: end_tag]
+                # Check that it's not a tag that closes itself and comment tags starting with <!
+                if textblock[end_tag-1] != '/' and textblock[0] != '!':
+                    # check for closing tag first
+                    if len(tag) > 0 and tag[0] == '/':
+                        if tag[1] == ' ':
+                            # space not allowed when closing tag between / and tag
+                            err += 1
+                            self.printErrorHeader(entry, self.log)
+                            tag = tag[2:]
+                            print(f"ERROR: No space allowed between '/' and [{tag}] when closing a tag.", file=self.log)
+                        else:
+                            tag = tag[1:]
+                        # space is allowed when closing tag, however I think it looks ugly,
+                        # so let's issue a warning
+                        space = tag.find(' ')
+                        if space > -1:
+                            #err += 1
+                            tag = tag[: space]
+                            if self.warnings:
+                                self.printErrorHeader(entry, self.log)
+                                print(f"WARNING: Unnecessary space between tag and closing bracket, see [{tag}].", file=self.log)
+                        # no uppercase allowed in tag
+                        lo_tag = tag.lower()
+                        if tag != lo_tag:
+                            self.printErrorHeader(entry, self.log)
+                            err += 1
+                            print(f"ERROR: Closing tag [{tag}] should be all lowercase.", file=self.log)
+                            tag = lo_tag
+                        if len(stack) == 0:
+                            self.printErrorHeader(entry, self.log)
+                            err += 1
+                            print(f"ERROR: Closing tag [{tag}] before opening tag.", file=self.log)
+                        else:
+                            if stack[-1] == tag:
+                                # Correct closing tag found, remove from stack
+                                stack.pop()
+                            else:
+                                self.printErrorHeader(entry, self.log)
+                                err += 1
+                                print(f"ERROR: Found closing tag [{tag}], however we expected [{stack[0]}].", file=self.log)
+                                print(f"\tRemaining tags: {str(stack)}", file=self.log)
+                                if tag in stack:
+                                    stack.remove(tag)
+                                    print("\t  Assuming incorrect tag order, found and removed tag from the stack", file=self.log)
+                                elif len(stack) == 1:
+                                    stack.pop()
+                                    print("\t  Assuming typo, removed remaining tag from the stack", file=self.log)
+
+                    else:
+                        # Tag can have multiple elements inside, watch for first space
+                        space = tag.find(' ')
+                        err_space = False
+                        # Get rid of unlikely occurrence of multiple spaces before opening tag
+                        while space == 0:
+                            err_space = True
+                            tag = tag[1:]
+                            space = tag.find(' ')
+                            if space == -1:
+                                break
+
+                        if space > -1:
+                            tag = tag[: space]
+
+                        skip = False
+                        if err_space:
+                            if textblock[end_tag-1] == ' ':
+                                # Assume these are random < and > characters not a tag
+                                if self.warnings:
+                                    self.printErrorHeader(entry, self.log)
+                                    print(f"WARNING: Assuming random < and > encountered, but could be a faulty tag too.", file=self.log)
+                                skip = True
+                            else:
+                                # Suspicious, erroneous space(s) before tag name?
+                                self.printErrorHeader(entry, self.log)
+                                err += 1
+                                print(f"ERROR: No space allowed when opening a tag, see [{tag}].", file=self.log)
+
+                        if not skip:
+                            open_tag = (len(tag) > 0)
+                            if open_tag:
+                                # no uppercase allowed in tag
+                                lo_tag = tag.lower()
+                                if tag != lo_tag:
+                                    self.printErrorHeader(entry, self.log)
+                                    err += 1
+                                    print(f"ERROR: Opening tag [{tag}] should be all lowercase.", file=self.log)
+                                    tag = lo_tag
+                                # Add opening tag to stack
+                                stack.append(tag)
+                            else:
+                                self.printErrorHeader(entry, self.log)
+                                err += 1
+                                print(f"ERROR: Empty opening tag <> not allowed.", file=self.log)
+                else:
+                    if len(tag) == 1:
+                        # empty closing tag not allowed
+                        err += 1
+                        self.printErrorHeader(entry, self.log)
+                        tag = tag[1:]
+                        print(f"ERROR: Empty closing tag not allowed.", file=self.log)
+
+                textblock = textblock[end_tag+1:]
+                start_tag = textblock.find('<')
+            else:
+                start_tag = -1
+
+
+        if len(stack):
+            err += 1
+            self.printErrorHeader(entry, self.log)
+            print(f"ERROR: Found tags that were not closed: {str(stack)}.", file=self.log)
+
+        return err
+
+    def check_illegal_chars(self, entry):
+        text = entry.msgstr
+        result = 0
+        idx = 0
+        textlen = len(text)
+
+        # msgfmt doesn't like vertical tab (\v) so detect it here...
+        # See: https://gitlab.gnome.org/GNOME/gimp-help/-/commit/6b661af55bc6dc90bb198c59702d9b6cfc42f94f
+        # Problem is that polib unescapes all strings, but doesn't handle \v
+        vert_tab = text.find("\\v")
+        if vert_tab > -1:
+            #result += 1
+            if self.warnings:
+                # for now just a warning until polib handles \v
+                self.printErrorHeader(entry, self.log)
+                print(f"WARNING: Possible vertical tab (\\v) detected. Note: could be a false positive.", file=self.log)
+            #print(f"ERROR: Vertical tab (\\v) not allowed.", file=self.log)
+
+        while idx < textlen:
+            if text[idx] < ' ':
+                if text[idx] != "\n" and text[idx] != "\t":
+                    result += 1
+                    self.printErrorHeader(entry, self.log)
+                    print(f"ERROR: Found illegal character in text: {ord(text[idx])}.", file=self.log)
+
+            idx += 1
+
+        return result
+
+
+    def run(self):
+        errcnt = 0
+        valid_entries = [e for e in self.po if not e.obsolete]
+        for entry in valid_entries:
+            skip_parse = False
+            self.headerPrinted = False
+
+            if entry.msgid.startswith("@@image:"):
+                # TODO: check and warn for img not found msg?
+                # But that probably shouldn't be done in the
+                # po validator but in a source xml validator.
+                continue
+            elif entry.msgid.startswith("translator-credits"):
+                # TODO: Should we check for translators not in AUTHORS?
+                #continue
+                # Parsing would give errors on email address like <a@b.c> because
+                # it considers that a tag without closing tag
+                # However, we do want to check for chars like \v
+                skip_parse = True
+
+            errcnt += self.check_illegal_chars(entry)
+            if not skip_parse:
+                errcnt += self.parse_text(entry)
+
+            # TODO Possible enhancements:
+            #      - Compare tags found in msgid and msgstr
+            #        The same tags and the same number of tags should be found
+            #      - compare msgid and msgstr also for differences in:
+            #        + number of brackets () [] {}
+            #        + number of sentences
+            #        + use of quotes (which can differ per language)
+
+        if errcnt > 0:
+            self.errors += errcnt
+            self.files_with_errors += 1
+            print(f"\n{self.pofile}: {errcnt} errors detected.\n")
+
+def printVersion():
+    print(f"\nPo Validator v {VERSION}, copyright 2021, 2022 Jacob Boerema")
+
+def main(argv):
+    # Make sure stdout and stderr output utf-8 even on Windows where it's not the default
+    sys.stdout = open(sys.stdout.fileno(), 'w', encoding='utf-8', closefd=False)
+    sys.stderr = open(sys.stderr.fileno(), 'w', encoding='utf-8', closefd=False)
+
+    verbose = False
+    warnings = False
+
+    try:
+        opts, remaining_args = getopt.getopt(argv, "hvw",
+            [
+                "help", "verbose", "warnings"
+            ])
+    except getopt.GetoptError as err:
+        usage()
+        sys.exit(0)
+
+    for opt, arg in opts:
+        if opt == "-h" or opt == "--help":
+            usage()
+            sys.exit(0)
+        elif opt == "-v" or opt == "--verbose":
+            verbose = True
+        elif opt == "-w" or opt == "--warnings":
+            warnings = True
+
+    # Treat remaining arguments as po files
+    filenames = []
+    while remaining_args:
+        filenames.append(remaining_args.pop())
+
+    if len(filenames) == 0:
+        usage()
+        sys.exit(0)
+
+    if verbose:
+        printVersion()
+
+    validator = Validate(verbose, warnings)
+    for file in filenames:
+        if verbose:
+            print(f"Checking {file}")
+        validator.setFile(file)
+        validator.run()
+
+    if validator.errors > 0:
+        print(f"{validator.files_with_errors} of {validator.filecnt} files contain errors")
+        print(f"Total number of errors: {validator.errors}")
+        sys.exit(1)
+
+
+def usage():
+    printVersion()
+    print("""Validates correct use of XML tags and other possible causes of problems in po files.
+
+usage: validate_po.py [options] POFILES
+
+    options:
+        -v      --verbose       be more verbose
+        -w      --warnings      show warnings
+        -h      --help          this help""")
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
\ No newline at end of file
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]