Re: [xml] htmlParseFile vs htmlParseDoc



On Thu, 16 Oct 2003, Jerome Pesenti wrote:

> Yes, here is a little program showing the problem.

That'll be the slowness of parsing one big chunk.  Pass the parser
smaller chunks from memory and it's faster.

Give it a sax context and of course it'll go much faster again!


Here's your demo prog hacked to show parsing in smaller chunks.

(FWIW, I regularly use chunked SAX parsing in the Apache Filter chain;
I have a talk about using Apache as a smart markup-aware
platform at ApacheCon in November.)


#include <stdlib.h>
#include <time.h>
#include <stdio.h>

#include <libxml/HTMLparser.h>

/* Size of the load buffer; files of SIZE bytes or more are rejected. */
#define SIZE (10*1024*1024)

/*
 * read_all: load a whole file into a freshly malloc'd, NUL-terminated buffer.
 *
 * name  path of the file to read
 * sz    out: number of bytes read (always smaller than SIZE on return)
 *
 * Returns the buffer; the caller owns it and must free() it.
 * Does not return on failure: if the file cannot be opened, memory cannot
 * be allocated, a read error occurs, or the file holds SIZE bytes or more,
 * a message is written to stderr and the process exits with status 1.
 */
char *
read_all(const char *name, int *sz)
{
    /* Binary mode: hand the parser the same bytes that are on disk. */
    FILE *f = fopen(name, "rb");
    char *s;
    size_t got;

    if (! f) {
        perror(name);
        exit(1);
    }

    s = malloc(sizeof(*s) * SIZE);
    if (! s) {
        perror("malloc");
        exit(1);
    }

    /*
     * fread() returns a size_t and never a negative value, so a read
     * error has to be detected with ferror().  A completely full buffer
     * means the file is too large (and leaves no room for the NUL).
     */
    got = fread(s, 1, SIZE, f);
    *sz = (int) got;
    if (ferror(f) || got == SIZE) {
        /* error or file too large */
        fprintf(stderr, "Could not load file, got %d bytes\n", *sz);
        exit(1);
    }

    s[got] = '\0';

    fclose(f);
    return s;
}

int
main(int argc, char **argv)
{
    int buflen ;
    char *str;
    htmlParserCtxtPtr ctxt;
    size_t bytes = 0 ;
    size_t count ;

    str = read_all(argv[1], &count);

    printf("%ld - parsing with htmlParseDocument\n", time(NULL));
    ctxt = htmlCreateMemoryParserCtxt(str, strlen(str));
    htmlParseDocument(ctxt);
    htmlFreeParserCtxt(ctxt) ;
    printf("%ld - done\n", time(NULL));
    printf("%ld - parsing with htmlParseDoc\n", time(NULL));
    htmlParseDoc(str, NULL);
    printf("%ld - done\n", time(NULL));
    printf("%ld - parsing with htmlParseFile\n", time(NULL));
    htmlParseFile(argv[1], NULL);
    printf("%ld - done\n", time(NULL));

    for ( buflen = 4 ; buflen < count ; buflen *= 2 ) {
      ctxt = NULL ;
      printf("%ld - parsing with htmlParseChunk size %d\n",
                time(NULL), buflen);
      for ( bytes = 0 ; bytes < count - buflen ; bytes += buflen)
        if ( ! ctxt )
          ctxt = htmlCreatePushParserCtxt(NULL, NULL, str, buflen, 0, 0) ;
        else
          htmlParseChunk(ctxt, (str+bytes), buflen, 0) ;
      htmlParseChunk(ctxt, (str+bytes), (count-bytes), 1) ;
      htmlFreeParserCtxt(ctxt) ;
      printf("%ld - done\n", time(NULL));
    }
    free(str) ;

    return 0;
}




[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]