Re: [xml] htmlParseFile vs htmlParseDoc
- From: Nick Kew <nick webthing com>
- To: Jerome Pesenti <jpesenti yahoo com>
- Cc: <xml gnome org>
- Subject: Re: [xml] htmlParseFile vs htmlParseDoc
- Date: Tue, 21 Oct 2003 16:04:19 +0100 (BST)
On Thu, 16 Oct 2003, Jerome Pesenti wrote:
> Yes, here is a little program showing the problem.
That'll be slowness parsing big chunks. Pass it smaller chunks
in memory and it's faster.
Give it a sax context and of course it'll go much faster again!
Here's your demo prog hacked to show parsing in smaller chunks.
(FWIW, I regularly use chunked SAX parsing in the Apache Filter chain;
I have a talk about using Apache as a smart markup-aware
platform at ApacheCon in November.)
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#include <libxml/HTMLparser.h>
/* Size of the read buffer; files must be strictly smaller than this. */
#define SIZE (10*1024*1024)

/*
 * Read the whole of file `name` into a freshly malloc'ed, NUL-terminated
 * buffer.
 *
 * name: path of the file to load.
 * sz:   out-parameter receiving the number of bytes read (excluding the
 *       terminating NUL).
 *
 * Returns the buffer; the caller owns it and must free() it.
 * On any failure (cannot open, out of memory, read error, or a file of
 * SIZE bytes or more) a diagnostic is printed and the process exits with
 * status 1, so the function never returns NULL.
 */
char *
read_all(const char *name, int *sz)
{
    /* Binary mode: the parser wants the raw bytes, without any newline
     * translation the C library might apply in text mode. */
    FILE *f = fopen(name, "rb");
    char *s;
    size_t got;

    if (!f) {
        perror(name);
        exit(1);
    }

    s = malloc(SIZE);
    if (!s) {
        perror("malloc");
        exit(1);
    }

    /* fread() returns a size_t and never a negative value, so a read error
     * has to be detected with ferror().  Reading exactly SIZE bytes means
     * the file did not fit (and would leave no room for the NUL). */
    got = fread(s, 1, SIZE, f);
    if (ferror(f) || got == SIZE) {
        /* error or file too large */
        fprintf(stderr, "Could not load file, got %d bytes\n", (int)got);
        exit(1);
    }
    fclose(f);

    s[got] = '\0';
    *sz = (int)got;   /* got < SIZE, which comfortably fits in an int */
    return s;
}
int
main(int argc, char **argv)
{
int buflen ;
char *str;
htmlParserCtxtPtr ctxt;
size_t bytes = 0 ;
size_t count ;
str = read_all(argv[1], &count);
printf("%ld - parsing with htmlParseDocument\n", time(NULL));
ctxt = htmlCreateMemoryParserCtxt(str, strlen(str));
htmlParseDocument(ctxt);
htmlFreeParserCtxt(ctxt) ;
printf("%ld - done\n", time(NULL));
printf("%ld - parsing with htmlParseDoc\n", time(NULL));
htmlParseDoc(str, NULL);
printf("%ld - done\n", time(NULL));
printf("%ld - parsing with htmlParseFile\n", time(NULL));
htmlParseFile(argv[1], NULL);
printf("%ld - done\n", time(NULL));
for ( buflen = 4 ; buflen < count ; buflen *= 2 ) {
ctxt = NULL ;
printf("%ld - parsing with htmlParseChunk size %d\n",
time(NULL), buflen);
for ( bytes = 0 ; bytes < count - buflen ; bytes += buflen)
if ( ! ctxt )
ctxt = htmlCreatePushParserCtxt(NULL, NULL, str, buflen, 0, 0) ;
else
htmlParseChunk(ctxt, (str+bytes), buflen, 0) ;
htmlParseChunk(ctxt, (str+bytes), (count-bytes), 1) ;
htmlFreeParserCtxt(ctxt) ;
printf("%ld - done\n", time(NULL));
}
free(str) ;
return 0;
}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]