[xml] Python XML doc displayer



I'm learning libxml2 because it seems to be robust and actively
maintained.  Also, I like the rich python support.   In one of my
learning exercises, I emulate xmlDebugDumpDocument by recursively
descending the document tree.  The program is pasted below.  I ran it
against several libxml2-2.6.16/doc/*.xml files for testing. My
comments/questions:

1) How does one get the document header properties (e.g. 'version',
'standalone', the DTD refentry) via python?
2) I had a problem testing whether a node has a namespace.  The libxml2.py
code throws an exception when xmlNode::ns() is called on a node that has no
namespace, and I didn't see a test ('hasNs'?) for namespace presence.
3) Is there a better way to walk the node tree?

Dave Turvene

------------------------------ snip, snip
-------------------------------------
#!/usr/bin/env python
# XML Document Node pretty printer, see inline __doc__ for more info
# Using: libxml2-2.6.16-2, libxml2-python-2.6.16-2
#
# 041202 Dave Turvene

import os, sys, string
from optparse import OptionParser
import libxml2, libxml2mod

# Indentation state shared by DispNode and DispNodeRecurs:
#   prefixincr -- the text added for each nesting level
#   prefix     -- the indentation currently in effect (starts out empty)
prefixincr = '  '
prefix = ''

def DispNode(self):
   """
   Display XML node, trying to match the output of 'xmllint --debug'
   or xmlDebugDumpDocument
   """

   global prefix, prefixincr

   print "%s%s" % (prefix, string.upper(self.get_type())),

   if (self.type == 'document_xml'):
       # Need to recover the version, encoding and standalone
       # properties for this document
       print "\n%sURL=%s" % (prefix, self.get_name())

   elif (self.type == 'element'):
       # Get the element

       # Hack to test if there is a namespace.  This is taken from the
       # xmlNode class code, but doesn't throw an exception with no
       # namespace
       ns = libxml2mod.xmlNodeGetNs(self._o)
       if (ns == None):
           print "%s" % (self.get_name())
       else:
           print "%s:%s" % (libxml2.xmlNs(ns).get_name(),self.get_name())

   elif (self.type == 'text'):
       # simple text node
       if (not self.content.isspace()):
           print "\n%scontent=%s" % ((prefix+prefixincr),
self.get_content())
       else:
           print "\n%scontent=" % (prefix+prefixincr)

   elif (self.type == 'attribute'):
       # attribute node, these are found by get_properties
       print self.get_name()

   elif (self.type == 'pi'):
       print self.get_name()
       print "\n%scontent=:%s:" % ((prefix+prefixincr), self.get_content())

   elif (self.type == 'dtd'):
       # Still need to work on this
       print self.get_name(), self.get_content()

   else:
       print "UNKNOWN TYPE:", self.type

def DispNodeRecurs(self):
   """Recursively display a node, then its attributes, then its children.

   The node itself is printed by DispNode.  Each level of recursion
   lengthens the module-level 'prefix' by one 'prefixincr' on entry and
   shortens it again on exit, so nested nodes are indented further.

   self -- a libxml2 node object providing get_properties(),
           get_children() and get_next().
   """

   # only 'prefix' is rebound here; 'prefixincr' is just read
   global prefix

   # increase the pretty printing offset upon entry
   prefix += prefixincr
   try:
       # display my information
       DispNode(self)

       # Display properties/attributes for self
       attr = self.get_properties()
       while attr is not None:
           DispNodeRecurs(attr)
           attr = attr.get_next()

       # Display children of self.  No separate child count is needed:
       # the loop simply does not run when get_children() returns None.
       child = self.get_children()
       while child is not None:
           DispNodeRecurs(child)
           child = child.get_next()
   finally:
       # unwind, so decrease the pretty printing offset -- also when a
       # node failed to display, so later output is not over-indented
       prefix = prefix[len(prefixincr):]

def XmlTreeDisp(argv):
   """Run libxml2 parser on an XML file, then do a depth-first walk
   of the node tree.  Send the output to stdout, progress/debug messages go
   to stderr.  There is an option to run debugDumpDocument to a file for
   comparison.

   argv -- argument list handed to OptionParser.parse_args; the script
           entry point passes sys.argv.

   Recognised options:
     -i, --infile  XML input file (default './schema/test.xsd')
     -t, --test    file that receives the debugDumpDocument output
     -d, --debug   turn on libxml2 memory debugging and report the
                   result on stderr after cleanup
   """

   # Command line parsing
   definputfile = './schema/test.xsd'
   parser = OptionParser()
   parser.add_option('-i', '--infile',
                     dest='inxmlfile',
                     default=definputfile,
                     help='XML input file, default to ' + definputfile)

   parser.add_option('-t', '--test',
                     dest='testfile',
                     help='dump xmlDebugDumpDocument to a file for comparison')

   parser.add_option('-d', '--debug',
                     action='store_true',
                     dest='debug',
                     help='enable debugging, such as it is')

   (options, args) = parser.parse_args(argv)

   if options.debug:
       libxml2.debugMemory(1)

   # Parse the input xml document
   doc = libxml2.parseFile(options.inxmlfile)
   try:
       # depth-first walk of the xml tree
       print >> sys.stderr, ">> Running pretty printer against ", \
             options.inxmlfile
       DispNodeRecurs(doc)

       # If option is set, write xmlDebugDumpDocument for comparison
       if options.testfile is not None:
           print >> sys.stderr, ">> Running debugDumpDocument, writing to ", \
                 options.testfile
           fout = open(options.testfile, 'w')
           try:
               doc.debugDumpDocument(fout)
           finally:
               # close the dump file even if the dump itself fails
               fout.close()
   finally:
       # clean myself up and get outta town, also on errors
       doc.freeDoc()
       libxml2.cleanupParser()

   if options.debug:
       # non-zero means libxml2 still holds memory after cleanup
       leaked = libxml2.debugMemory(1)
       if leaked != 0:
           print >> sys.stderr, "Memory leak %d bytes" % leaked
           libxml2.dumpMemory()
       else:
           print >> sys.stderr, "Memory checks OK"

# Run as a script: hand the complete command line to the tree displayer.
if __name__ == "__main__":
   XmlTreeDisp(sys.argv)





[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]