[xml] ¿leak using libxml2 for sax-parsing html in python?



Hi all,

I am using libxml2 for parsing HTML in Python. I suspected that libxml2 could be involved in a memory leak, so I modified one of the Python examples from the website in order to process a relevant number of HTML files while I checked the memory consumption with the top command.
And... yes! The program's memory consumption keeps increasing until it finishes.

Am I forgetting something in the code? Or is there something wrong with the Python bindings?

Thank you, Cesar

Note1: I do nothing in the Callback
Note2: I have tried to use the cleanup functions after the 'ctxt = None' with the same results.

****************************************] The Code [****************************************

#!/usr/bin/python -u
import libxml2

#------------------------------------------------------------------------------


# Memory debug specific: call libxml2.debugMemory(1) before any parsing is
# done. The same call is repeated at the end of the script, where its return
# value is reported as the number of leaked bytes (0 means "OK").
# NOTE(review): presumably the argument 1 switches allocation tracking on --
# confirm against the libxml2 Python bindings documentation.
libxml2.debugMemory(1)

#------------------------------------------------------------------------------

class callback:
    """SAX handler handed to the libxml2 HTML push parser.

    Every callback is deliberately a no-op, except ``startDocument``, which
    prints a dot so that progress is visible. Keeping the handler empty means
    that any memory growth observed while parsing cannot come from work done
    in the callbacks themselves.
    """

    def startDocument(self):
        """Print a single "." each time a document starts."""
        # Parenthesised form: prints the same text under the Python 2 print
        # statement and is also valid as a Python 3 function call.
        print(".")

    def endDocument(self):
        """Ignore the end-of-document event."""
        pass

    def startElement(self, tag, attrs):
        """Ignore an opening element *tag* and its *attrs*."""
        pass

    def endElement(self, tag):
        """Ignore a closing element *tag*."""
        pass

    def characters(self, data):
        """Ignore character *data*."""
        pass

    def warning(self, msg):
        """Ignore a parser warning *msg*."""
        pass

    def error(self, msg):
        """Ignore a parser error *msg*."""
        pass

    def fatalError(self, msg):
        """Ignore a fatal parser error *msg*."""
        pass
       
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------       
import os
import sys

# Number of times the whole set of files is parsed; repeating the run makes a
# steady growth in memory use easier to observe.
NUM_ITERS = 5


def _is_html_name(file_name):
    """Return True when *file_name* ends in ``.htm`` or ``.html``.

    The comparison is case-sensitive. An explicit suffix test is used so that
    names whose extension merely starts with ``.htm`` (for example
    ``page.htmx``) are not picked up.
    """
    return file_name.endswith(".htm") or file_name.endswith(".html")


def _list_html_files(dir_path):
    """Return the names (not full paths) of the HTML files in *dir_path*.

    Only the directory itself is scanned, in ``os.listdir`` order.
    """
    return [name for name in os.listdir(dir_path) if _is_html_name(name)]


def _read_file(path):
    """Return the full contents of the file at *path*.

    The file is opened in binary mode so that ``len()`` of the result is the
    byte count handed to the parser, and it is closed even if reading fails.
    """
    f = open(path, "rb")
    try:
        return f.read()
    finally:
        f.close()


def _parse_files(handler, dir_path, file_names, iterations):
    """Push-parse every file in *file_names*, *iterations* times over.

    Each file gets a fresh parser context that reports events to *handler*.
    """
    for _ in range(iterations):
        for file_name in file_names:
            print(file_name)
            # os.path.join supplies the separator, so the directory argument
            # works with or without a trailing slash.
            data = _read_file(os.path.join(dir_path, file_name))

            # Start the push parser with an empty initial chunk, then feed the
            # whole file as a single chunk.
            # NOTE(review): the trailing 1 is presumably the "last chunk"
            # flag -- confirm against the libxml2 bindings.
            ctxt = libxml2.htmlCreatePushParser(handler, "", 0, file_name)
            ctxt.htmlParseChunk(data, len(data), 1)
            # Drop the only reference to the context before the next file.
            ctxt = None


def _report_memory():
    """Run the libxml2 parser cleanup and print the outstanding byte count.

    Prints "OK" when ``libxml2.debugMemory(1)`` returns 0; otherwise prints
    the number of bytes and asks libxml2 to dump its memory state.
    """
    libxml2.cleanupParser()
    leaked = libxml2.debugMemory(1)
    if leaked == 0:
        print("OK")
    else:
        print("Memory leak %d bytes" % leaked)
        libxml2.dumpMemory()


def main(argv):
    """Parse all HTML files of the directory named in *argv* and report leaks.

    *argv* has the shape of ``sys.argv``: the program name followed by exactly
    one argument, the directory holding the HTML files.

    Returns 0 on success and 1 when the arguments are wrong, the directory
    does not exist, or it contains no ``.htm``/``.html`` files.
    """
    program_name = os.path.basename(argv[0])

    if len(argv) != 2:
        print("Use: %s <dir html files>" % program_name)
        return 1

    input_path = argv[1]

    # isdir rather than exists: a plain file would pass an existence check
    # and then make os.listdir fail.
    if not os.path.isdir(input_path):
        print("Error: directory does not exist")
        return 1

    input_file_names = _list_html_files(input_path)
    if not input_file_names:
        print("Error: no input files")
        return 1

    handler = callback()
    _parse_files(handler, input_path, input_file_names, NUM_ITERS)

    # Memory debug specific
    _report_memory()
    return 0


if __name__ == "__main__":
    exit_status = main(sys.argv)
    if exit_status:
        sys.exit(exit_status)
   
# Other cleanup functions   
#libxml2.cleanupCharEncodingHandlers()
#libxml2.cleanupEncodingAliases()
#libxml2.cleanupGlobals()
#libxml2.cleanupInputCallbacks()
#libxml2.cleanupOutputCallbacks()   
#libxml2.cleanupPredefinedEntities()   


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]