gimp-help-2 r2423 - in trunk: . tools
- From: ulfehlert svn gnome org
- To: svn-commits-list gnome org
- Subject: gimp-help-2 r2423 - in trunk: . tools
- Date: Sun, 30 Mar 2008 19:50:22 +0100 (BST)
Author: ulfehlert
Date: Sun Mar 30 19:50:22 2008
New Revision: 2423
URL: http://svn.gnome.org/viewvc/gimp-help-2?rev=2423&view=rev
Log:
2008-03-30 Ulf-D. Ehlert <ulfehlert svn gnome org>
* tools/README
* tools/validate_references.py: replace DOM-based parsing with
SAX-based parsing, which will speed up the script
Modified:
trunk/ChangeLog
trunk/tools/README
trunk/tools/validate_references.py
Modified: trunk/tools/README
==============================================================================
--- trunk/tools/README (original)
+++ trunk/tools/README Sun Mar 30 19:50:22 2008
@@ -165,8 +165,6 @@
Requirements:
- pyxml from http://pyxml.sourceforge.net
- or
- - lxml from http://codespeak.net/lxml/
Call the script from the gimp-help-2 root:
Modified: trunk/tools/validate_references.py
==============================================================================
--- trunk/tools/validate_references.py (original)
+++ trunk/tools/validate_references.py Sun Mar 30 19:50:22 2008
@@ -1,8 +1,8 @@
#!/usr/bin/env python
# _*_ coding: latin1 -*_
-
+#
# gimp-help-2 -- Validate image file references
-# Copyright (C) 2006, 2007, 2008 Ró Joost
+# Copyright (C) 2006, 2007, 2008 Róman Joost
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -18,251 +18,90 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
+
import sys
import os
import getopt
import re
-import unittest
-import doctest
-import StringIO
-
-try:
- import lxml.etree
- HAVE_LXML = True
-except ImportError:
- HAVE_LXML = False
-
-try:
- import xml.xpath
- import xml.dom.minidom
- HAVE_XML = True
-except ImportError:
- HAVE_XML = False
-
-# xpath expressions of filereferences in a DocBook XML file
-REFERENCESTOTEST = ['//imagedata[@fileref]', '//graphic[@fileref]',
- '//inlinegraphic[@fileref]']
-
-# check only xml files
-XMLFILE_EXP = re.compile('[\w-]*\.xml$')
-
-# check only png and jpg files
-IMAGEFILE_EXP = re.compile('[\w-]*\.(png|jpg)$')
-
-
-class XMLReferenceValidator(object):
- """A validator to validate filereferences in a DocBook complient
- XML file.
- """
-
- def __init__(self, xpath_expr, filepath=None, xmlstr=None):
- self.xpath_expr = xpath_expr
- self.filepath = filepath
- self.xmlstr = xmlstr
- self.img_references = []
- self.invalid = []
+import xml.sax
- def get_imagefp(self, fileref):
- """Creates the absolute filepath.
- The fileref is an attribute value from a gimp-help-2 compliant
- DocBook/XML file.
- """
- mangeled = fileref.split('/')[1:]
- base = os.curdir
- imagefp = "/".join(mangeled)
- # now put everything together
- imagefp = os.path.join(base, imagefp)
-
- # save the filename in our list
- if imagefp not in self.img_references:
- self.img_references.append(imagefp)
-
- return imagefp
-
- def _validation_helper(self, imagefp):
- """Helper method, which checks if the given imagefilepath is
- correct.
- """
- imagefp = imagefp.encode()
- if not os.path.exists(imagefp) and self.filepath is not None:
- return (self.filepath, imagefp)
- elif not os.path.exists(imagefp) and self.xmlstr is not None:
- return (0, imagefp)
-
- def validate_imagepath_references(self):
- raise ValueError("Implemented in sublcasses.")
-
-
-class LxmlValidator(XMLReferenceValidator):
- """A validator to validate filereferences in a DocBook complient
- XML file.
-
- >>> str = '<sect1><imagedata '\
- 'fileref="../images/toolbox/toolbox-flip.png" /></sect1>'
- >>> val = LxmlValidator(REFERENCESTOTEST[0], xmlstr=str)
- >>> val.validate_imagepath_references()
- []
-
- Create a scrambled reference and check if the validator throws an
- error.
- >>> str = '<sect1><imagedata '\
- 'fileref="../foobar/toolbox/toolbox-flip.png" /></sect1>'
- >>> val = LxmlValidator(REFERENCESTOTEST[0], xmlstr=str)
- >>> val.validate_imagepath_references()
- [(0, './foobar/toolbox/toolbox-flip.png')]
-
- >>> str = '<sect2><graphic '\
- 'fileref="../foobar/math/dot-for-dot.png" /></sect2>'
- >>> val = LxmlValidator(REFERENCESTOTEST[1], xmlstr=str)
- >>> val.validate_imagepath_references()
- [(0, './foobar/math/dot-for-dot.png')]
- """
+# Nodes containing filereferences in a DocBook XML file
+IMAGE_NODES = ["imagedata", "graphic", "inlinegraphic"]
- def get_elements_by_xpath(self):
- """Returns the elements to test as a list."""
- if self.xmlstr is not None:
- doc = lxml.etree.parse(StringIO.StringIO(self.xmlstr))
- else:
- try:
- doc = lxml.etree.parse(open(self.filepath))
- except:
- print self.filepath
- return doc.xpath(self.xpath_expr)
+# Regular expression for image files to be checked (png and jpg files only)
+IMAGEFILE_REGEX = re.compile('[\w+-]*\.(png|jpg)$')
+# Regular expression for image files to be skipped
+IGNORE_IMAGE_REGEX = re.compile('callout')
- def validate_imagepath_references(self):
- """Validates all references
+class FileNameContainer(object):
+ """A special purpose container base class.
- returns a tuple (xmlfilepath, imagefilepath) if the reference
- is broken
- """
- elements = self.get_elements_by_xpath()
+ This class stores filenames and provides some basic
+ access methods common to both derived List classes.
- for elm in elements:
- # mangle the filepath
- fileref = elm.get('fileref')
- imagefp = self.get_imagefp(fileref)
-
- result = self._validation_helper(imagefp)
- if result is not None:
- self.invalid.append(result)
-
- return self.invalid
-
-
-class LibXMLValidator(XMLReferenceValidator):
- """ It is not important to have a valid DocBook/XML file here. We
- just test, if the method pics up the correct filepath and
- validates it.
- >>> str = '<sect1><imagedata '\
- 'fileref="../images/toolbox/toolbox-flip.png" /></sect1>'
- >>> val = LibXMLValidator(REFERENCESTOTEST[0], xmlstr=str)
- >>> val.validate_imagepath_references()
- []
-
- Create a scrambled reference and check if the validator throws an
- error.
- >>> str = '<sect1><imagedata '\
- 'fileref="../foobar/toolbox/toolbox-flip.png" /></sect1>'
- >>> val = LibXMLValidator(REFERENCESTOTEST[0], xmlstr=str)
- >>> val.validate_imagepath_references()
- [(0, './foobar/toolbox/toolbox-flip.png')]
-
- >>> str = '<sect2><graphic '\
- 'fileref="../foobar/math/dot-for-dot.png" /></sect2>'
- >>> val = LibXMLValidator(REFERENCESTOTEST[1], xmlstr=str)
- >>> val.validate_imagepath_references()
- [(0, './foobar/math/dot-for-dot.png')]
"""
+ def __init__(self, verbose=0):
+ self.verbose = verbose
+ self.data = {}
- def get_elements_by_xpath(self):
- """Returns the elements to test as a list."""
- if self.xmlstr is not None:
- dom = xml.dom.minidom.parseString(self.xmlstr)
- else:
- dom = xml.dom.minidom.parse(self.filepath)
- return xml.xpath.Evaluate(self.xpath_expr, dom)
+ def contains(self, key):
+ """Is there an entry 'key'?."""
+ return self.data.has_key(key)
+
+ def add(self, key, val=True):
+ """Add a (key, value) pair to the container, e.g.
+ filename and some info corresponding to this filename.
+ """
+ self.data[key] = val
- def validate_imagepath_references(self):
- """Validates all references
+ def remove(self, key):
+ """Mark an entry as invalid, i.e. set the info to False,
+ so that this method may be used with an iterator.
+ """
+ if self.data.has_key(key):
+ self.data[key] = False
- returns a tuple (xmlfilepath, imagefilepath) if the reference
- is broken
+ def size(self):
+ """Return the number of data entries."""
+ return len(self.data)
+
+ def sort_out_valid(self, other):
+ """Same as 'difference(self,other)'. When common entries
+ have been removed, the remaining filenames are just
+ the orphaned files, or the broken links, respectively.
"""
- for elm in self.get_elements_by_xpath():
- fileref = elm.getAttribute('fileref')
- imagefp = self.get_imagefp(fileref)
+ self.difference(other)
- result = self._validation_helper(imagefp)
- if result is not None:
- self.invalid.append(result)
+ def difference(self, other):
+ """Remove entries common to 'self' and 'other'."""
+ for key in self.data.iterkeys():
+ if other.contains(key):
+ self.remove(key)
+ other.remove(key)
+ self.data = dict((key, self.data[key])
+ for key in self.data if self.data.get(key))
- return self.invalid
+class ImageFilesList(FileNameContainer):
+ """A container for image file names.
-class FileLookup(object):
- """Runs through each directory of <gimp_help_root> and checks each
- xml file it can find.
+ This class is used to collect and save all image files
+ in the gimp-help-2 'images/' directory.
- >>> fl = FileLookup(gimp_help_root='src/toolbox')
- >>> fl.get_image_root()
- 'images'
"""
+ def __init__(self, verbose=0):
+ super(ImageFilesList, self).__init__(verbose)
- def __init__(self, verbose=0, absolute=1, gimp_help_root='.'):
- self.verbose = verbose
- self.absolute = absolute
- self.gimp_help_root = gimp_help_root
-
- self.all_img_references = []
- self.brokenimages = []
-
- def get_image_root(self):
- """Returns the absolute path to the images directory
- """
- result = None
- root = self.gimp_help_root
- _head, _tail = os.path.split(root)
-
- # if we are already in the gimp-help-root we don't need to do
- # the traversal
- if os.path.exists(os.path.join(root, 'images')) and\
- os.path.exists(os.path.join(root, 'src')):
- return os.path.join(root, 'images')
-
- while root:
- # if we hit the gimp_help_root, we need to check if an
- # 'images' dir exist
- if os.path.exists(os.path.join(_head, 'images')) and not\
- _head.endswith('src'):
- result = os.path.join(_head, 'images')
- break
-
- root = _head
- _head, _tail = os.path.split(root)
-
- if not _tail:
- break
-
- return result
-
- def validate_imagefiles(self):
- """checks if each image file is referenced in the XML files"""
- imageroot = self.get_image_root()
-
- if not self.all_img_references:
- return
-
- if imageroot is None:
- sys.stderr.write("The path you specified or the directory"\
- " you're in do not contain an 'images"\
- " directory.\n")
- return
+ def find(self, imageroot = "images"):
+ """Search for PNG and JPG files in the image directory."""
+ if self.verbose:
+ sys.stderr.write("searching images ... ")
+ if self.verbose > 1:
+ sys.stderr.write("\n")
for root, dirs, files in os.walk(imageroot):
- # XXX this filtering of dirs is awkward, but I couldn't come
- # up with a better method yet
for prune in [ 'callouts', '.svn' ]:
if prune in dirs:
dirs.remove(prune)
@@ -272,129 +111,240 @@
continue
# don't care about other files than images files
- for file in filter(IMAGEFILE_EXP.match, files):
- filepath = os.path.join(root, file)
- if filepath not in self.all_img_references:
- sys.stdout.write(filepath + "\n")
-
- def validate_refs(self):
- """walks to each xml file directory, reads each xml file and
- validates the references
+ for filename in (name for name in files
+ if IMAGEFILE_REGEX.match(name)):
+ filepath = os.path.join(root, filename)
+ if self.verbose > 1:
+ sys.stderr.write(filepath + '\n')
+ self.add(filepath.replace("images/", ""))
+
+ if self.verbose:
+ sys.stderr.write(str(len(self.data)) + "\n")
+
+ def report(self):
+ """Print the list of orphaned image files, i.e. image files
+ which are not referenced in the XML source files.
"""
- top = os.path.join(self.gimp_help_root, 'src')
- for root, dirs, files in os.walk(top):
- # don't visit .svn directories
- if '.svn' in dirs:
- dirs.remove('.svn')
-
- if self.verbose:
- sys.stdout.write("Checking %s\n" %root)
-
- # don't care about other files than xml files
- for file in filter(XMLFILE_EXP.match, files):
-
- # puzzle together the relative filepath
- xml_filepath = os.path.join(root, file)
-
- for xpathexpr in REFERENCESTOTEST:
- if HAVE_LXML:
- val = LxmlValidator(xpathexpr,
- xml_filepath)
- else:
- val = LibXMLValidator(xpathexpr,
- xml_filepath)
-
- result = val.validate_imagepath_references()
- if result is not []:
- self.brokenimages.append(result)
-
- # XXX thats kinda stupid, because we have two lists
- # which save the filepaths of the images
- self.all_img_references += val.img_references
-
- def print_broken_imagefilepaths(self):
- """Prints out which xml files have broken references to
- images.
+ if self.verbose:
+ sys.stderr.write(str(self.size()) + " orphaned image file")
+ if self.size() != 1: sys.stderr.write("s")
+ if self.size() != 0: sys.stderr.write(":")
+ sys.stderr.write("\n")
+ for imagefile in sorted(self.data.keys()):
+ print "ORPHANED:", "images/" + imagefile
+
+
+class ImageReferencesList(FileNameContainer):
+ """A container for image file references.
+
+ This class is used to collect and save all image file
+ references in the XML source files, i.e. it saves
+ ('image-file', 'source-file') pairs.
+
+ """
+ def __init__(self, source, verbose=0):
+ super(ImageReferencesList, self).__init__(verbose)
+ self.source = source
+ self.cur_files = [] # stack for files in progress
+ self.all_files = 0 # visited files
+ self.handler = XMLHandler(self)
+ self.parser = self.make_parser()
+
+ def make_parser(self):
+ """Create and return an initialized SAX XMLReader object,
+ i.e. an XML parser. A content handler is attached to
+ the parser and some appropriate features are set.
+ """
+ parser = xml.sax.make_parser()
+ parser.setContentHandler(self.handler)
+ parser.setFeature(xml.sax.handler.feature_namespaces, 0)
+ parser.setFeature(xml.sax.handler.feature_external_ges, 0)
+ parser.setFeature(xml.sax.handler.feature_external_pes, 0)
+ return parser
+
+ def find(self):
+ """Parse XML files and extract image references."""
+
+ if self.verbose:
+ sys.stderr.write("parsing XML files ... ")
+ if self.verbose > 1: sys.stderr.write("\n")
+
+ self.push_file(self.source)
+ self.parser.parse(self.source)
+
+ assert(len(self.cur_files) == 1)
+
+ if self.verbose:
+ if self.verbose > 1: sys.stderr.write("parsed ")
+ sys.stderr.write(str(self.all_files) + " files, " +
+ str(self.size()) + " references\n")
+
+ def report(self):
+ """Print the list of broken image references
+ in the XML source file(s).
"""
- for itemlist in self.brokenimages:
- for item in itemlist:
- if self.absolute:
- item = os.path.abspath(item)
- if self.verbose:
- errormsg = "File %s \ncontains invalid references"\
- " to\n%s\n\n" %(item)
- else:
- errormsg = "%s invalid: <%s>\n" %(item)
- sys.stdout.write(errormsg)
+ if self.verbose:
+ sys.stderr.write(str(self.size()) + " broken image reference")
+ if self.size() != 1: sys.stderr.write("s")
+ if self.size() != 0: sys.stderr.write(":")
+ sys.stderr.write("\n")
+ for imagefile in sorted(self.data.keys()):
+ print "BROKEN:", imagefile, "IN", self.data[imagefile]
+
+ # Internal stack methods to keep track of the opened files
+
+ def current_file(self):
+ """The file currently parsed."""
+ return self.cur_files[-1] # top of filenames stack
+
+ def push_file(self, filename):
+ """Add entry to internal stack of filenames."""
+ self.cur_files.append(filename)
+ self.all_files += 1
+
+ def pop_file(self):
+ """Remove top entry from internal stack of filenames."""
+ return self.cur_files.pop()
+
+
+class XMLHandler(xml.sax.handler.ContentHandler):
+ """A content handler class as defined by the SAX API."""
+ def __init__(self, owner):
+ #super(XMLHandler, self).__init__()
+ self.owner = owner
+ def startElement(self, name, attrs):
+ """Handle image nodes."""
+ if name in IMAGE_NODES:
+ fileref = attrs.getValue('fileref').replace("../images/", "")
+ if not IGNORE_IMAGE_REGEX.match(fileref):
+ self.owner.add(fileref, self.owner.current_file())
+ if name == "xi:include" and attrs.has_key('href'):
+ filename = os.path.join(os.path.dirname(self.owner.current_file()),
+ attrs.getValue('href'))
+ if self.owner.verbose > 1:
+ sys.stderr.write("parsing " + str(filename) + "\n")
+ self.owner.push_file(filename)
+ parser = self.owner.make_parser()
+ parser.parse(filename)
+ self.owner.pop_file()
-def run_doctests():
- doctest.testmod()
def main():
- verbose = 0
- absolute = 0
- gimp_help_root = os.curdir
+ """The main program (hmm, what did you expect?).
+
+ The algorithm for validating image files and references is
+ very simple:
+
+ Let
+ (1) I := (set of) all image files,
+ (2) R := (set of) all image file references,
+ then
+ (3) B := R \ I = R \ (R ∩ I)
+ is the set containing files in R but not in I, that is the set
+ of broken references,
+ (4) O := I \ R = I \ (R ∩ I)
+ is the set containing files in I but not in R, that is the set
+ of images not referenced in the XML files (orphaned images).
+ """
+ verbose = 0
+ gimp_help_root_dir = "."
+ xml_root_file = "src/gimp.xml"
+ find_orphaned_images = False
+ find_broken_references = False
try:
- opts, args = getopt.getopt(sys.argv[1:], "hivatf:x:")
+ opts, args = getopt.getopt(sys.argv[1:], "hvr:bliof:",
+ ["help", "verbose", "root", "broken",
+ "links", "orphaned", "images", "file"])
except getopt.GetoptError:
- usage()
- sys.exit(2)
+ usage(64)
- for o, a in opts:
- if o == "-h":
+ for opt, arg in opts:
+ if opt == "-h" or opt == "--help":
usage()
- sys.exit(0)
- if o == "-v":
- verbose = 1
- if o == "-a":
- absolute = 1
- if o == "-t":
- run_doctests()
- sys.exit(0)
- if o == "-f":
- result = []
- for xpath_expr in REFERENCESTOTEST:
- if HAVE_LXML:
- val = LxmlValidator(xpath_expr, a)
- elif HAVE_XML:
- val = LibXMLValidator(xpath_expr, a)
- else:
- sys.exit(1)
- result += val.validate_imagepath_references()
-
- for r in result:
- print "%s invalid: %s" %(r)
- sys.exit(1)
-
- if o == "-x":
- gimp_help_root = a
- if o == "-i":
- filelookup = FileLookup(verbose, absolute, gimp_help_root)
- filelookup.validate_refs()
- filelookup.validate_imagefiles()
- sys.exit(1)
-
- filelookup = FileLookup(verbose, absolute, gimp_help_root)
- filelookup.validate_refs()
- filelookup.print_broken_imagefilepaths()
- sys.exit(1)
-
-def usage():
+ elif opt == "-v" or opt == "--verbose":
+ verbose = verbose + 1
+ elif opt in ["-r", "--root"]:
+ gimp_help_root_dir = arg
+ elif opt in ["-b", "-l", "--broken", "--links"]:
+ find_broken_references = True
+ elif opt in ["-i", "-o", "--orphaned", "--images"]:
+ find_orphaned_images = True
+ elif opt == "-f" or opt == "--file":
+ find_broken_references = True
+ xml_root_file = arg
+
+ # Change to user specified root dir.
+ if gimp_help_root_dir != ".":
+ try:
+ os.chdir(gimp_help_root_dir)
+ except OSError, (errno, strerror):
+ sys.stderr.write("Error: " + strerror + ": " + \
+ gimp_help_root_dir +"\n")
+ sys.exit(errno)
+
+ # Check for the correct directory.
+ if not (os.path.isdir("images/") and os.path.isdir("src/")):
+ usage(66, "This script must be called from the " +
+ "gimp-help-2 root directory.")
+
+ # We need an existing xml source file to parse.
+ if not os.path.isfile(xml_root_file):
+ usage(66, "Cannot find " + xml_root_file + ".")
+
+ # When finding orphaned images, we must parse all xml files.
+ if find_orphaned_images and (xml_root_file != "src/gimp.xml"):
+ usage(64, "'--file <file>' and '--orphaned' are mutually exclusive.")
+
+ # If no action specified: search for broken image references.
+ if not (find_orphaned_images or find_broken_references):
+ find_broken_references = True
+
+ # Step 1: find all image files.
+ image_files = ImageFilesList(verbose)
+ image_files.find()
+
+ # Step 2: find all image references.
+ image_refs = ImageReferencesList(xml_root_file, verbose)
+ image_refs.find()
+
+ # Step 3: remove intersection of image references and images files,
+ # the result is the list of invalid (broken) references.
+ if find_broken_references:
+ image_refs.sort_out_valid(image_files)
+ image_refs.report()
+
+ # Step 4: remove intersection of image references and images files,
+ # the result is the list of orphaned image files.
+ if find_orphaned_images:
+ image_files.sort_out_valid(image_refs)
+ image_files.report()
+
+
+def usage(exitcode=0, msg=""):
+ """Help the user."""
+ if msg:
+ sys.stderr.write("Error: " + msg + "\n")
+ else:
+ sys.stderr.write ( """\
+validate_references - Copyright (C) 2006-2008 Róman Joost (gimp-help-2)
+validates file references in docbook xml files.\n""")
sys.stderr.write ( """\
-validate_references - Copyright 2006 Roman Joost (gimp-help-2)
-validates file references in docbook xml files.
usage: validate_references.py [options]
options:
- -h this help
- -t run doctests
- -v verbose
- -a print relative paths as absolute paths
- -i check for orphaned image files
- -f <file> check only <file>
- -x <path> specify another root of xml files \n""")
+ -h | --help this help
+ -v | --verbose verbose; doubling (-v -v) is possible
+ -r <dir> specify the gimp-hel-2 root directory
+ --root <dir> same as '-r'
+ -o | --orphaned check for orphaned image files
+ -i | --images same as '-o'
+ -b | --broken check for broken links
+ (this is the default action)
+ -f <file> check only <file>
+ (implies '-b', conflicts with '-o')\n""")
+ sys.exit(exitcode)
if __name__ == "__main__":
main()
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]