[libxml2] Consolidate seed corpus generation
- From: Nick Wellnhofer <nwellnhof src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [libxml2] Consolidate seed corpus generation
- Date: Mon, 24 Aug 2020 19:20:32 +0000 (UTC)
commit 0d5f3710fb554619e75d473c2126cc6649cad941
Author: Nick Wellnhofer <wellnhofer aevum de>
Date: Mon Aug 24 16:28:54 2020 +0200
Consolidate seed corpus generation
Implement file handling in C to speed up corpus generation.
fuzz/.gitignore | 5 +-
fuzz/Makefile.am | 76 +++-------
fuzz/fuzz.c | 48 +------
fuzz/fuzz.h | 3 -
fuzz/genSeed.c | 407 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
fuzz/htmlSeed.c | 36 -----
fuzz/schemaSeed.c | 34 -----
fuzz/xmlSeed.c | 32 -----
fuzz/xpathSeed.c | 157 ---------------------
9 files changed, 432 insertions(+), 366 deletions(-)
---
diff --git a/fuzz/.gitignore b/fuzz/.gitignore
index 92e291350..02c74b112 100644
--- a/fuzz/.gitignore
+++ b/fuzz/.gitignore
@@ -1,9 +1,8 @@
corpus/
+genSeed
html
-htmlSeed
regexp
schema
-schemaSeed
seed/html*
seed/schema*
seed/xml*
@@ -11,6 +10,4 @@ seed/xpath*
testFuzzer
uri
xml
-xmlSeed
xpath
-xpathSeed
diff --git a/fuzz/Makefile.am b/fuzz/Makefile.am
index 7f0bcef76..49b955413 100644
--- a/fuzz/Makefile.am
+++ b/fuzz/Makefile.am
@@ -1,6 +1,5 @@
AUTOMAKE_OPTIONS = -Wno-syntax
-EXTRA_PROGRAMS = html htmlSeed regexp uri schema schemaSeed xml xmlSeed \
- xpath xpathSeed
+EXTRA_PROGRAMS = genSeed html regexp schema uri xml xpath
check_PROGRAMS = testFuzzer
CLEANFILES = $(EXTRA_PROGRAMS)
AM_CPPFLAGS = -I$(top_srcdir)/include
@@ -8,16 +7,17 @@ DEPENDENCIES = $(top_builddir)/libxml2.la
LDADD = $(STATIC_BINARIES) $(top_builddir)/libxml2.la $(THREAD_LIBS) $(Z_LIBS) $(LZMA_LIBS) $(ICONV_LIBS)
$(M_LIBS) $(WIN32_EXTRA_LIBADD)
XML_MAX_LEN = 80000
+# Single quotes to avoid wildcard expansion by the shell
XML_SEED_CORPUS_SRC = \
- $(top_srcdir)/test/* \
- $(top_srcdir)/test/errors/*.xml \
- $(top_srcdir)/test/errors10/*.xml \
- $(top_srcdir)/test/namespaces/* \
- $(top_srcdir)/test/valid/*.xml \
- $(top_srcdir)/test/VC/* \
- $(top_srcdir)/test/VCM/* \
- $(top_srcdir)/test/XInclude/docs/* \
- $(top_srcdir)/test/xmlid/*
+ '$(top_srcdir)/test/*' \
+ '$(top_srcdir)/test/errors/*.xml' \
+ '$(top_srcdir)/test/errors10/*.xml' \
+ '$(top_srcdir)/test/namespaces/*' \
+ '$(top_srcdir)/test/valid/*.xml' \
+ '$(top_srcdir)/test/VC/*' \
+ '$(top_srcdir)/test/VCM/*' \
+ '$(top_srcdir)/test/XInclude/docs/*' \
+ '$(top_srcdir)/test/xmlid/*'
testFuzzer_SOURCES = testFuzzer.c fuzz.c
@@ -35,25 +35,15 @@ clean-corpus:
rm -rf seed/xml.stamp seed/xml
rm -rf seed/xpath.stamp seed/xpath
-# XML fuzzer
+# Seed corpus
+
+genSeed_SOURCES = genSeed.c fuzz.c
-xmlSeed_SOURCES = xmlSeed.c fuzz.c
+# XML fuzzer
-seed/xml.stamp: xmlSeed$(EXEEXT)
+seed/xml.stamp: genSeed$(EXEEXT)
@mkdir -p seed/xml
- @for i in $(XML_SEED_CORPUS_SRC); do \
- if [ -f $$i ]; then \
- echo Processing seed $$i; \
- base=$$(basename $$i) \
- outfile=$(abs_builddir)/seed/xml/$$base; \
- pushd $$(dirname $$i) >/dev/null; \
- $(abs_builddir)/xmlSeed$(EXEEXT) $$base > $$outfile; \
- popd >/dev/null; \
- if [ "$$(wc -c < $$outfile)" -gt $(XML_MAX_LEN) ]; then \
- rm $$outfile; \
- fi; \
- fi; \
- done
+ @./genSeed$(EXEEXT) xml $(XML_SEED_CORPUS_SRC)
@touch seed/xml.stamp
xml_SOURCES = xml.c fuzz.c
@@ -69,16 +59,9 @@ fuzz-xml: xml$(EXEEXT) seed/xml.stamp
# HTML fuzzer
-htmlSeed_SOURCES = htmlSeed.c fuzz.c
-
-seed/html.stamp: htmlSeed$(EXEEXT)
+seed/html.stamp: genSeed$(EXEEXT)
@mkdir -p seed/html
- @for i in $(top_srcdir)/test/HTML/*; do \
- if [ -f $$i ]; then \
- echo Processing seed $$i; \
- ./htmlSeed$(EXEEXT) $$i > seed/html/$$(basename $$i); \
- fi; \
- done
+ @./genSeed$(EXEEXT) html '$(top_srcdir)/test/HTML/*'
@touch seed/html.stamp
html_SOURCES = html.c fuzz.c
@@ -119,20 +102,9 @@ fuzz-uri: uri$(EXEEXT)
# XML Schema fuzzer
-schemaSeed_SOURCES = schemaSeed.c fuzz.c
-
-seed/schema.stamp: schemaSeed$(EXEEXT)
+seed/schema.stamp: genSeed$(EXEEXT)
@mkdir -p seed/schema
- @for i in ../test/schemas/*.xsd; do \
- if [ -f $$i ]; then \
- echo Processing seed $$i; \
- base=$$(basename $$i) \
- outfile=$(abs_builddir)/seed/schema/$$base; \
- pushd $$(dirname $$i) >/dev/null; \
- $(abs_builddir)/schemaSeed$(EXEEXT) $$base > $$outfile; \
- popd >/dev/null; \
- fi; \
- done
+ @./genSeed$(EXEEXT) schema '$(top_srcdir)/test/schemas/*.xsd'
@touch seed/schema.stamp
schema_SOURCES = schema.c fuzz.c
@@ -148,11 +120,9 @@ fuzz-schema: schema$(EXEEXT) seed/schema.stamp
# XPath fuzzer
-xpathSeed_SOURCES = xpathSeed.c fuzz.c
-
-seed/xpath.stamp: xpathSeed$(EXEEXT)
+seed/xpath.stamp: genSeed$(EXEEXT)
@mkdir -p seed/xpath
- @./xpathSeed$(EXEEXT) "$(top_builddir)/test/XPath"
+ @./genSeed$(EXEEXT) xpath "$(top_builddir)/test/XPath"
@touch seed/xpath.stamp
xpath_SOURCES = xpath.c fuzz.c
diff --git a/fuzz/fuzz.c b/fuzz/fuzz.c
index 543235c48..b5dfa185f 100644
--- a/fuzz/fuzz.c
+++ b/fuzz/fuzz.c
@@ -72,11 +72,6 @@ xmlFuzzDataInit(const char *data, size_t size) {
fuzzData.mainEntity = NULL;
}
-static void
-xmlFreeEntityEntry(void *value, const xmlChar *name) {
- xmlFree(value);
-}
-
/**
* xmlFuzzDataFree:
*
@@ -85,7 +80,7 @@ xmlFreeEntityEntry(void *value, const xmlChar *name) {
void
xmlFuzzDataCleanup(void) {
xmlFree(fuzzData.outBuf);
- xmlHashFree(fuzzData.entities, xmlFreeEntityEntry);
+ xmlHashFree(fuzzData.entities, xmlHashDefaultDeallocator);
}
/**
@@ -193,47 +188,6 @@ xmlFuzzReadString(size_t *size) {
return(NULL);
}
-/*
- * A custom entity loader that writes all external DTDs or entities to a
- * single file in the format expected by xmlFuzzEntityLoader.
- */
-xmlParserInputPtr
-xmlFuzzEntityRecorder(const char *URL, const char *ID,
- xmlParserCtxtPtr ctxt) {
- xmlParserInputPtr in;
- static const int chunkSize = 16384;
- int len;
-
- in = xmlNoNetExternalEntityLoader(URL, ID, ctxt);
- if (in == NULL)
- return(NULL);
-
- if (fuzzData.entities == NULL) {
- fuzzData.entities = xmlHashCreate(4);
- } else if (xmlHashLookup(fuzzData.entities,
- (const xmlChar *) URL) != NULL) {
- return(in);
- }
-
- do {
- len = xmlParserInputBufferGrow(in->buf, chunkSize);
- if (len < 0) {
- fprintf(stderr, "Error reading %s\n", URL);
- xmlFreeInputStream(in);
- return(NULL);
- }
- } while (len > 0);
-
- xmlFuzzWriteString(stdout, URL);
- xmlFuzzWriteString(stdout, (char *) xmlBufContent(in->buf->buffer));
-
- xmlFreeInputStream(in);
-
- xmlHashAddEntry(fuzzData.entities, (const xmlChar *) URL, NULL);
-
- return(xmlNoNetExternalEntityLoader(URL, ID, ctxt));
-}
-
/**
* xmlFuzzReadEntities:
*
diff --git a/fuzz/fuzz.h b/fuzz/fuzz.h
index c39fa65e6..8716af93e 100644
--- a/fuzz/fuzz.h
+++ b/fuzz/fuzz.h
@@ -43,9 +43,6 @@ xmlFuzzWriteString(FILE *out, const char *str);
const char *
xmlFuzzReadString(size_t *size);
-xmlParserInputPtr
-xmlFuzzEntityRecorder(const char *URL, const char *ID, xmlParserCtxtPtr ctxt);
-
void
xmlFuzzReadEntities(void);
diff --git a/fuzz/genSeed.c b/fuzz/genSeed.c
new file mode 100644
index 000000000..68fb87a1b
--- /dev/null
+++ b/fuzz/genSeed.c
@@ -0,0 +1,407 @@
+/*
+ * genSeed.c: Generate the seed corpus for the HTML, Schema, XML and XPath fuzzers.
+ *
+ * See Copyright for the status of this software.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <glob.h>
+#include <libgen.h>
+#include <sys/stat.h>
+
+#ifdef _WIN32
+#include <direct.h>
+#else
+#include <unistd.h>
+#endif
+
+#include <libxml/parser.h>
+#include <libxml/parserInternals.h>
+#include <libxml/HTMLparser.h>
+#include <libxml/xinclude.h>
+#include <libxml/xmlschemas.h>
+#include "fuzz.h"
+
+/* Size of the fixed buffers that hold paths, glob patterns and the cwd. */
+#define PATH_SIZE 500
+/* Size of the copy buffer used by processHtml. */
+#define SEED_BUF_SIZE 16384
+/* Size of the line buffer processXPath reads one expression into. */
+#define EXPR_SIZE 4500
+
+/* Callback that writes the seed for the input file `base` to `out`. */
+typedef int
+(*fileFunc)(const char *base, FILE *out);
+
+/* Callback that main invokes once per command line argument. */
+typedef int
+(*mainFunc)(const char *arg);
+
+/* State shared by the functions in this file. */
+static struct {
+ /* Stream the entity recorder writes to (set by fuzzRecorderInit). */
+ FILE *out;
+ /* Keyed by entity URL; fuzzEntityRecorder adds each URL it has written. */
+ xmlHashTablePtr entities;
+ /* Entity loader saved by fuzzRecorderInit, restored on cleanup. */
+ xmlExternalEntityLoader oldLoader;
+ /* Per-file seed writer selected in main according to the fuzzer name. */
+ fileFunc processFile;
+ /* Fuzzer name from argv[1]; used as the directory below seed/. */
+ const char *fuzzer;
+ /* Running number appended to the XPath seed file names. */
+ int counter;
+ /* Working directory at startup; processPattern returns here after chdir. */
+ char cwd[PATH_SIZE];
+} globalData;
+
+/*
+ * A custom entity loader that writes all external DTDs or entities to a
+ * single file in the format expected by xmlFuzzEntityLoader.
+ *
+ * URL, ID and ctxt are passed through to xmlNoNetExternalEntityLoader.
+ * The URL and the complete content of the entity are written to
+ * globalData.out with xmlFuzzWriteString, once per URL.
+ *
+ * Returns a fresh input stream for the entity, or NULL if the entity could
+ * not be loaded or read.
+ */
+static xmlParserInputPtr
+fuzzEntityRecorder(const char *URL, const char *ID,
+                   xmlParserCtxtPtr ctxt) {
+    xmlParserInputPtr in;
+    static const int chunkSize = 16384;
+    xmlChar *marker;
+    int len;
+
+    in = xmlNoNetExternalEntityLoader(URL, ID, ctxt);
+    if (in == NULL)
+        return(NULL);
+
+    if (globalData.entities == NULL) {
+        globalData.entities = xmlHashCreate(4);
+    } else if (xmlHashLookup(globalData.entities,
+                             (const xmlChar *) URL) != NULL) {
+        /* Already recorded: hand the untouched stream to the parser. */
+        return(in);
+    }
+
+    /* Pull the whole entity into the input buffer. */
+    do {
+        len = xmlParserInputBufferGrow(in->buf, chunkSize);
+        if (len < 0) {
+            fprintf(stderr, "Error reading %s\n", URL);
+            xmlFreeInputStream(in);
+            return(NULL);
+        }
+    } while (len > 0);
+
+    xmlFuzzWriteString(globalData.out, URL);
+    xmlFuzzWriteString(globalData.out,
+                       (char *) xmlBufContent(in->buf->buffer));
+
+    xmlFreeInputStream(in);
+
+    /*
+     * xmlHashLookup returns the stored value, so an entry added with a NULL
+     * value cannot be told apart from a missing key and the duplicate check
+     * above would never match. Store a small heap-allocated marker instead;
+     * it is released by xmlFree when the table is freed with
+     * xmlHashDefaultDeallocator.
+     */
+    marker = xmlCharStrdup("1");
+    if ((marker != NULL) &&
+        (xmlHashAddEntry(globalData.entities, (const xmlChar *) URL,
+                         marker) != 0))
+        xmlFree(marker);
+
+    /* The stream above was consumed, so open a new one for the parser. */
+    return(xmlNoNetExternalEntityLoader(URL, ID, ctxt));
+}
+
+/*
+ * Start recording external entities: remember the currently installed
+ * entity loader, create an empty URL table, direct recorded data to `out`
+ * and install fuzzEntityRecorder as the external entity loader.
+ */
+static void
+fuzzRecorderInit(FILE *out) {
+    xmlExternalEntityLoader previous = xmlGetExternalEntityLoader();
+
+    globalData.oldLoader = previous;
+    globalData.entities = xmlHashCreate(8);
+    globalData.out = out;
+
+    xmlSetExternalEntityLoader(fuzzEntityRecorder);
+}
+
+/*
+ * Stop recording external entities: reinstall the entity loader saved by
+ * fuzzRecorderInit, free the URL table (values are released through
+ * xmlHashDefaultDeallocator) and reset the recorder fields of globalData.
+ */
+static void
+fuzzRecorderCleanup(void) {
+    xmlSetExternalEntityLoader(globalData.oldLoader);
+    xmlHashFree(globalData.entities, xmlHashDefaultDeallocator);
+    globalData.out = NULL;
+    globalData.entities = NULL;
+    globalData.oldLoader = NULL;
+}
+
+/*
+ * Write the seed for the XML fuzzer to `out`: the parser options as a raw
+ * int, followed by whatever the entity recorder emits while docFile is
+ * parsed and XInclude-processed with those options.
+ *
+ * Always returns 0.
+ */
+static int
+processXml(const char *docFile, FILE *out) {
+    int parseOpts = XML_PARSE_NOENT | XML_PARSE_DTDLOAD;
+    xmlDocPtr parsed;
+
+    fwrite(&parseOpts, sizeof(parseOpts), 1, out);
+
+    /* Everything loaded between init and cleanup is recorded to `out`. */
+    fuzzRecorderInit(out);
+    parsed = xmlReadFile(docFile, NULL, parseOpts);
+    xmlXIncludeProcessFlags(parsed, parseOpts);
+    xmlFreeDoc(parsed);
+    fuzzRecorderCleanup();
+
+    return(0);
+}
+
+/*
+ * Write the seed for the HTML fuzzer to `out`: an int holding the parser
+ * options (0), followed by a verbatim copy of docFile.
+ *
+ * Always returns 0, even if docFile cannot be opened; in that case a
+ * message is printed to stderr and only the options are written.
+ */
+static int
+processHtml(const char *docFile, FILE *out) {
+    char chunk[SEED_BUF_SIZE];
+    FILE *src;
+    size_t nread;
+    int opts = 0;
+
+    fwrite(&opts, sizeof(opts), 1, out);
+
+    src = fopen(docFile, "rb");
+    if (src == NULL) {
+        fprintf(stderr, "couldn't open %s\n", docFile);
+        return(0);
+    }
+
+    /* Copy chunk by chunk; a short read marks the end of the file. */
+    for (;;) {
+        nread = fread(chunk, 1, SEED_BUF_SIZE, src);
+        if (nread > 0)
+            fwrite(chunk, 1, nread, out);
+        if (nread != SEED_BUF_SIZE)
+            break;
+    }
+
+    fclose(src);
+    return(0);
+}
+
+/*
+ * Write the seed for the XML Schema fuzzer to `out`: whatever the entity
+ * recorder emits while the schema in docFile is parsed. Parser errors and
+ * warnings are routed to xmlFuzzErrorFunc.
+ *
+ * Always returns 0.
+ */
+static int
+processSchema(const char *docFile, FILE *out) {
+    xmlSchemaParserCtxtPtr parserCtxt;
+    xmlSchemaPtr parsed;
+
+    fuzzRecorderInit(out);
+
+    parserCtxt = xmlSchemaNewParserCtxt(docFile);
+    xmlSchemaSetParserErrors(parserCtxt, xmlFuzzErrorFunc, xmlFuzzErrorFunc,
+                             NULL);
+    parsed = xmlSchemaParse(parserCtxt);
+
+    /* Only the recorded entities matter; the parse result is discarded. */
+    xmlSchemaFreeParserCtxt(parserCtxt);
+    xmlSchemaFree(parsed);
+
+    fuzzRecorderCleanup();
+
+    return(0);
+}
+
+/*
+ * Expand a glob pattern and generate one seed per matching regular file.
+ *
+ * For each file the seed is written to "seed/<fuzzer>/<basename>" (relative
+ * to the directory the program was started in). The process changes into
+ * the directory of the input file before calling globalData.processFile
+ * with the base name, and changes back to globalData.cwd afterwards.
+ *
+ * Returns 0 on success (including a pattern without matches) and -1 if the
+ * pattern could not be expanded or any file could not be processed. A
+ * failure for one file does not stop the remaining files from being
+ * processed, except when returning to the original directory fails.
+ */
+static int
+processPattern(const char *pattern) {
+    glob_t globbuf;
+    int ret = 0;
+    int res;
+    size_t i; /* same type as gl_pathc, avoids a signed/unsigned compare */
+
+    res = glob(pattern, 0, NULL, &globbuf);
+    if (res == GLOB_NOMATCH)
+        return(0);
+    if (res != 0) {
+        fprintf(stderr, "couldn't match pattern %s\n", pattern);
+        return(-1);
+    }
+
+    for (i = 0; i < globbuf.gl_pathc; i++) {
+        struct stat statbuf;
+        char outPath[PATH_SIZE];
+        char *dirBuf = NULL;
+        char *baseBuf = NULL;
+        const char *path, *dir, *base;
+        FILE *out = NULL;
+        int dirChanged = 0;
+        size_t size;
+
+        path = globbuf.gl_pathv[i];
+
+        /* Skip directories and anything else that is not a regular file. */
+        if ((stat(path, &statbuf) != 0) || (!S_ISREG(statbuf.st_mode)))
+            continue;
+
+        /* dirname and basename may modify their argument: use copies. */
+        dirBuf = (char *) xmlCharStrdup(path);
+        baseBuf = (char *) xmlCharStrdup(path);
+        if ((dirBuf == NULL) || (baseBuf == NULL)) {
+            fprintf(stderr, "memory allocation failed\n");
+            ret = -1;
+            goto error;
+        }
+        dir = dirname(dirBuf);
+        base = basename(baseBuf);
+
+        /* A negative snprintf result wraps to a huge size_t and fails too. */
+        size = snprintf(outPath, sizeof(outPath), "seed/%s/%s",
+                        globalData.fuzzer, base);
+        if (size >= PATH_SIZE) {
+            fprintf(stderr, "creating path failed\n");
+            ret = -1;
+            goto error;
+        }
+        /* Open the output before chdir: outPath is relative to the cwd. */
+        out = fopen(outPath, "wb");
+        if (out == NULL) {
+            fprintf(stderr, "couldn't open %s for writing\n", outPath);
+            ret = -1;
+            goto error;
+        }
+        if (chdir(dir) != 0) {
+            fprintf(stderr, "couldn't chdir to %s\n", dir);
+            ret = -1;
+            goto error;
+        }
+        dirChanged = 1;
+        if (globalData.processFile(base, out) != 0)
+            ret = -1;
+
+error:
+        /* fclose flushes the seed; report a failed write instead of
+         * silently leaving a truncated file behind. */
+        if ((out != NULL) && (fclose(out) != 0)) {
+            fprintf(stderr, "error writing %s\n", outPath);
+            ret = -1;
+        }
+        xmlFree(dirBuf);
+        xmlFree(baseBuf);
+        if ((dirChanged) && (chdir(globalData.cwd) != 0)) {
+            /* All further relative paths would be wrong: give up. */
+            fprintf(stderr, "couldn't chdir to %s\n", globalData.cwd);
+            ret = -1;
+            break;
+        }
+    }
+
+    globfree(&globbuf);
+    return(ret);
+}
+
+/*
+ * Generate XPath seeds from the expression files matching
+ * "<testDir>/<subdir>/<prefix>*".
+ *
+ * Every line of every matching regular file is treated as one expression.
+ * For each expression a file "seed/xpath/<name>-<counter>" is written that
+ * contains the expression followed by `data`, both encoded with
+ * xmlFuzzWriteString. globalData.counter is incremented after each seed.
+ *
+ * xptr: if zero, the expression is wrapped as "xpointer(<expr>)"; otherwise
+ * it is written unchanged.
+ *
+ * Returns 0 on success (including a pattern without matches) and -1 if the
+ * pattern could not be built or expanded, or if any input or output file
+ * could not be opened.
+ */
+static int
+processXPath(const char *testDir, const char *prefix, const char *name,
+             const char *data, const char *subdir, int xptr) {
+    char pattern[PATH_SIZE];
+    glob_t globbuf;
+    size_t i, size;
+    int ret = 0, res;
+
+    size = snprintf(pattern, sizeof(pattern), "%s/%s/%s*",
+                    testDir, subdir, prefix);
+    if (size >= PATH_SIZE)
+        return(-1);
+    res = glob(pattern, 0, NULL, &globbuf);
+    if (res == GLOB_NOMATCH)
+        return(0);
+    if (res != 0) {
+        fprintf(stderr, "couldn't match pattern %s\n", pattern);
+        return(-1);
+    }
+
+    for (i = 0; i < globbuf.gl_pathc; i++) {
+        char *path = globbuf.gl_pathv[i];
+        struct stat statbuf;
+        FILE *in;
+        char expr[EXPR_SIZE];
+
+        if ((stat(path, &statbuf) != 0) || (!S_ISREG(statbuf.st_mode)))
+            continue;
+
+        in = fopen(path, "rb");
+        if (in == NULL) {
+            ret = -1;
+            continue;
+        }
+
+        /* fgets returns a pointer: compare against NULL, not with '>'. */
+        while (fgets(expr, EXPR_SIZE, in) != NULL) {
+            char outPath[PATH_SIZE];
+            FILE *out;
+            int j;
+
+            /* Cut the expression at the first line terminator. */
+            for (j = 0; expr[j] != 0; j++)
+                if (expr[j] == '\r' || expr[j] == '\n')
+                    break;
+            expr[j] = 0;
+
+            size = snprintf(outPath, sizeof(outPath), "seed/xpath/%s-%d",
+                            name, globalData.counter);
+            if (size >= PATH_SIZE) {
+                ret = -1;
+                continue;
+            }
+            out = fopen(outPath, "wb");
+            if (out == NULL) {
+                ret = -1;
+                continue;
+            }
+
+            if (xptr) {
+                xmlFuzzWriteString(out, expr);
+            } else {
+                char xptrExpr[EXPR_SIZE+100];
+
+                /* Wrap XPath expressions as XPointer */
+                snprintf(xptrExpr, sizeof(xptrExpr), "xpointer(%s)", expr);
+                xmlFuzzWriteString(out, xptrExpr);
+            }
+
+            xmlFuzzWriteString(out, data);
+
+            fclose(out);
+            globalData.counter++;
+        }
+
+        fclose(in);
+    }
+
+    globfree(&globbuf);
+
+    return(ret);
+}
+
+/*
+ * Generate the seed corpus for the XPath fuzzer from a test directory.
+ *
+ * First, the expressions below "<testDir>/expr" are combined with the
+ * document "<d></d>". Then, for every file matching "<testDir>/docs/*",
+ * the document is loaded with xmlSlurpFile and combined with the
+ * expressions from the files in "<testDir>/tests" and "<testDir>/xptr"
+ * whose names start with the document's base name. The seed counter is
+ * reset to 1 for the "expr" run and for every document.
+ *
+ * Returns 0 on success and -1 on any failure, the same convention as
+ * processPattern and processXPath.
+ */
+int
+processXPathDir(const char *testDir) {
+    char pattern[PATH_SIZE];
+    glob_t globbuf;
+    size_t i, size;
+    int ret = 0;
+
+    globalData.counter = 1;
+    if (processXPath(testDir, "", "expr", "<d></d>", "expr", 0) != 0)
+        ret = -1;
+
+    size = snprintf(pattern, sizeof(pattern), "%s/docs/*", testDir);
+    if (size >= PATH_SIZE)
+        return(-1);
+    /* A docs directory without any match is treated as an error here. */
+    if (glob(pattern, 0, NULL, &globbuf) != 0)
+        return(-1);
+
+    for (i = 0; i < globbuf.gl_pathc; i++) {
+        char *path = globbuf.gl_pathv[i];
+        char *data;
+        const char *docFile;
+
+        data = xmlSlurpFile(path, NULL);
+        if (data == NULL) {
+            ret = -1;
+            continue;
+        }
+        /* basename may modify path; path is not used after this point. */
+        docFile = basename(path);
+
+        globalData.counter = 1;
+        if (processXPath(testDir, docFile, docFile, data, "tests", 0) != 0)
+            ret = -1;
+        if (processXPath(testDir, docFile, docFile, data, "xptr", 1) != 0)
+            ret = -1;
+
+        xmlFree(data);
+    }
+
+    globfree(&globbuf);
+
+    return(ret);
+}
+
+/*
+ * Entry point: genSeed FUZZER PATTERN...
+ *
+ * FUZZER selects the kind of seed to generate ("html", "schema", "xml" or
+ * "xpath"). For "xpath" every remaining argument is a test directory
+ * handled by processXPathDir; for the other fuzzers every remaining
+ * argument is a glob pattern handled by processPattern.
+ *
+ * Returns 0 if all arguments were processed successfully and 1 on a usage
+ * error, an unknown fuzzer, a getcwd failure or if processing any argument
+ * failed. All arguments are processed even if an earlier one failed.
+ */
+int
+main(int argc, const char **argv) {
+    mainFunc processArg = processPattern;
+    const char *fuzzer;
+    int ret = 0;
+    int i;
+
+    if (argc < 3) {
+        fprintf(stderr, "usage: seed [FUZZER] [PATTERN...]\n");
+        return(1);
+    }
+
+    xmlSetGenericErrorFunc(NULL, xmlFuzzErrorFunc);
+
+    fuzzer = argv[1];
+    if (strcmp(fuzzer, "html") == 0) {
+        globalData.processFile = processHtml;
+    } else if (strcmp(fuzzer, "schema") == 0) {
+        globalData.processFile = processSchema;
+    } else if (strcmp(fuzzer, "xml") == 0) {
+        globalData.processFile = processXml;
+    } else if (strcmp(fuzzer, "xpath") == 0) {
+        processArg = processXPathDir;
+    } else {
+        fprintf(stderr, "unknown fuzzer %s\n", fuzzer);
+        return(1);
+    }
+    globalData.fuzzer = fuzzer;
+
+    /* processPattern needs the start directory to return to after chdir. */
+    if (getcwd(globalData.cwd, PATH_SIZE) == NULL) {
+        fprintf(stderr, "couldn't get current directory\n");
+        return(1);
+    }
+
+    /* Propagate failures through the exit status so make notices them. */
+    for (i = 2; i < argc; i++) {
+        if (processArg(argv[i]) != 0)
+            ret = 1;
+    }
+
+    return(ret);
+}
+
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]