[libxml2] Update fuzzing code

From: Nick Wellnhofer <nwellnhof src gnome org>
To: commits-list gnome org
Cc:
Subject: [libxml2] Update fuzzing code
Date: Fri, 31 Jul 2020 10:08:08 +0000 (UTC)
commit 905820a44c0c895c02124ecacff735794509f4fe
Author: Nick Wellnhofer <wellnhofer aevum de>
Date:   Sun Jul 12 22:59:39 2020 +0200

    Update fuzzing code
    
    - Shorten timeouts
    - Align options from Makefile and options files
    - Add section headers to Makefile
    - Skip invalid UTF-8 in regexp fuzzer
    - Update regexp.dict
    - Generate HTML seed corpus in correct format

 fuzz/.gitignore     |   2 +
 fuzz/Makefile.am    |  54 ++++++++++++++------
 fuzz/html.options   |   1 +
 fuzz/htmlSeed.c     |  36 ++++++++++++++
 fuzz/regexp.c       |  15 +++---
 fuzz/regexp.dict    | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fuzz/regexp.options |   3 ++
 fuzz/schema.options |   1 +
 fuzz/uri.options    |   3 ++
 fuzz/xml.options    |   1 +
 10 files changed, 233 insertions(+), 22 deletions(-)
---
diff --git a/fuzz/.gitignore b/fuzz/.gitignore
index d7ea79643..eecb4aea4 100644
--- a/fuzz/.gitignore
+++ b/fuzz/.gitignore
@@ -1,8 +1,10 @@
 corpus/
 html
+htmlSeed
 regexp
 schema
 schemaSeed
+seed/html*
 seed/xml*
 seed/schema*
 testFuzzer
diff --git a/fuzz/Makefile.am b/fuzz/Makefile.am
index c360e567d..9a1225dba 100644
--- a/fuzz/Makefile.am
+++ b/fuzz/Makefile.am
@@ -1,11 +1,11 @@
-EXTRA_PROGRAMS = html regexp uri schema schemaSeed xml xmlSeed
+EXTRA_PROGRAMS = html htmlSeed regexp uri schema schemaSeed xml xmlSeed
 check_PROGRAMS = testFuzzer
 CLEANFILES = $(EXTRA_PROGRAMS)
 AM_CPPFLAGS = -I$(top_srcdir)/include
 DEPENDENCIES = $(top_builddir)/libxml2.la
 LDADD = $(STATIC_BINARIES) $(top_builddir)/libxml2.la $(THREAD_LIBS) $(Z_LIBS) $(LZMA_LIBS) $(ICONV_LIBS) $(M_LIBS) $(WIN32_EXTRA_LIBADD)
 
-PARSER_FUZZER_MAX_LEN = 100000
+XML_MAX_LEN = 80000
 XML_SEED_CORPUS_SRC = \
     $(top_srcdir)/test/* \
     $(top_srcdir)/test/errors/*.xml \
@@ -16,6 +16,14 @@ XML_SEED_CORPUS_SRC = \
     $(top_srcdir)/test/VC/* \
     $(top_srcdir)/test/VCM/*
 
+testFuzzer_SOURCES = testFuzzer.c fuzz.c
+
+tests: testFuzzer$(EXEEXT)
+       @echo "## Running fuzzer tests"
+       @./testFuzzer$(EXEEXT)
+
+# XML fuzzer
+
 xmlSeed_SOURCES = xmlSeed.c fuzz.c
 
 seed/xml.stamp: xmlSeed$(EXEEXT)
@@ -28,19 +36,13 @@ seed/xml.stamp: xmlSeed$(EXEEXT)
                 pushd $$(dirname $$i) >/dev/null; \
                $(abs_builddir)/xmlSeed$(EXEEXT) $$base > $$outfile; \
                 popd >/dev/null; \
-               if [ "$$(wc -c < $$outfile)" -gt $(PARSER_FUZZER_MAX_LEN) ]; then \
+               if [ "$$(wc -c < $$outfile)" -gt $(XML_MAX_LEN) ]; then \
                    rm $$outfile; \
                fi; \
            fi; \
        done
        @touch seed/xml.stamp
 
-testFuzzer_SOURCES = testFuzzer.c fuzz.c
-
-tests: testFuzzer$(EXEEXT)
-       @echo "## Running fuzzer tests"
-       @./testFuzzer$(EXEEXT)
-
 xml_SOURCES = xml.c fuzz.c
 xml_LDFLAGS = -fsanitize=fuzzer
 
@@ -48,20 +50,36 @@ fuzz-xml: xml$(EXEEXT) seed/xml.stamp
        @mkdir -p corpus/xml
        ./xml$(EXEEXT) \
            -dict=xml.dict \
-           -max_len=$(PARSER_FUZZER_MAX_LEN) \
+           -max_len=$(XML_MAX_LEN) \
            -timeout=20 \
            corpus/xml seed/xml
 
+# HTML fuzzer
+
+htmlSeed_SOURCES = htmlSeed.c fuzz.c
+
+seed/html.stamp: htmlSeed$(EXEEXT)
+       @mkdir -p seed/html
+       @for i in $(top_srcdir)/test/HTML/*; do \
+           if [ -f $$i ]; then \
+               echo Processing seed $$i; \
+               ./htmlSeed$(EXEEXT) $$i > seed/html/$$(basename $$i); \
+           fi; \
+       done
+       @touch seed/html.stamp
+
 html_SOURCES = html.c fuzz.c
 html_LDFLAGS = -fsanitize=fuzzer
 
-fuzz-html: html$(EXEEXT)
+fuzz-html: html$(EXEEXT) seed/html.stamp
        @mkdir -p corpus/html
        ./html$(EXEEXT) \
            -dict=html.dict \
            -max_len=1000000 \
            -timeout=20 \
-           corpus/html $(top_srcdir)/test/HTML
+           corpus/html seed/html
+
+# Regexp fuzzer
 
 regexp_SOURCES = regexp.c fuzz.c
 regexp_LDFLAGS = -fsanitize=fuzzer
@@ -70,10 +88,12 @@ fuzz-regexp: regexp$(EXEEXT)
        @mkdir -p corpus/regexp
        ./regexp$(EXEEXT) \
            -dict=regexp.dict \
-           -max_len=10000 \
-           -timeout=20 \
+           -max_len=200 \
+           -timeout=5 \
            corpus/regexp $(srcdir)/seed/regexp
 
+# URI fuzzer
+
 uri_SOURCES = uri.c fuzz.c
 uri_LDFLAGS = -fsanitize=fuzzer
 
@@ -81,9 +101,11 @@ fuzz-uri: uri$(EXEEXT)
        @mkdir -p corpus/uri
        ./uri$(EXEEXT) \
            -max_len=10000 \
-           -timeout=2 \
+           -timeout=5 \
            corpus/uri $(srcdir)/seed/uri
 
+# XML Schema fuzzer
+
 schemaSeed_SOURCES = schemaSeed.c fuzz.c
 
 seed/schema.stamp: schemaSeed$(EXEEXT)
@@ -107,7 +129,7 @@ fuzz-schema: schema$(EXEEXT) seed/schema.stamp
        @mkdir -p corpus/schema
        ./schema$(EXEEXT) \
            -dict=schema.dict \
-           -max_len=$(PARSER_FUZZER_MAX_LEN) \
+           -max_len=$(XML_MAX_LEN) \
            -timeout=20 \
            corpus/schema seed/schema
 
diff --git a/fuzz/html.options b/fuzz/html.options
index e5ae71b93..a32c583ef 100644
--- a/fuzz/html.options
+++ b/fuzz/html.options
@@ -1,2 +1,3 @@
 [libfuzzer]
 max_len = 1000000
+timeout = 20
diff --git a/fuzz/htmlSeed.c b/fuzz/htmlSeed.c
new file mode 100644
index 000000000..f3213e2ec
--- /dev/null
+++ b/fuzz/htmlSeed.c
@@ -0,0 +1,36 @@
+/*
+ * htmlSeed.c: Generate the HTML seed corpus for fuzzing.
+ *
+ * See Copyright for the status of this software.
+ */
+
+#include <stdio.h>
+
+#define SEED_BUF_SIZE 16384
+
+int
+main(int argc, char **argv) {
+    int opts = 0;
+    FILE *file;
+    char buf[SEED_BUF_SIZE];
+    size_t size;
+
+    if (argc != 2) {
+        fprintf(stderr, "Usage: htmlSeed [FILE]\n");
+        return(1);
+    }
+
+    fwrite(&opts, sizeof(opts), 1, stdout);
+
+    /* Copy file */
+    file = fopen(argv[1], "rb");
+    do {
+        size = fread(buf, 1, SEED_BUF_SIZE, file);
+        if (size > 0)
+            fwrite(buf, 1, size, stdout);
+    } while (size == SEED_BUF_SIZE);
+    fclose(file);
+
+    return(0);
+}
+
diff --git a/fuzz/regexp.c b/fuzz/regexp.c
index ed13f637d..3b35671b4 100644
--- a/fuzz/regexp.c
+++ b/fuzz/regexp.c
@@ -23,14 +23,17 @@ LLVMFuzzerTestOneInput(const char *data, size_t size) {
 
     numStrings = xmlFuzzExtractStrings(data, size, str, 2);
 
-    regexp = xmlRegexpCompile(BAD_CAST str[0]);
-    /* xmlRegexpExec has pathological performance in too many cases. */
+    /* CUR_SCHAR doesn't handle invalid UTF-8 and may cause infinite loops. */
+    if (xmlCheckUTF8(BAD_CAST str[0]) != 0) {
+        regexp = xmlRegexpCompile(BAD_CAST str[0]);
+        /* xmlRegexpExec has pathological performance in too many cases. */
 #if 0
-    if ((regexp != NULL) && (numStrings >= 2)) {
-        xmlRegexpExec(regexp, BAD_CAST str[1]);
-    }
+        if ((regexp != NULL) && (numStrings >= 2)) {
+            xmlRegexpExec(regexp, BAD_CAST str[1]);
+        }
 #endif
-    xmlRegFreeRegexp(regexp);
+        xmlRegFreeRegexp(regexp);
+    }
 
     xmlFree(str[0]);
     xmlFree(str[1]);
diff --git a/fuzz/regexp.dict b/fuzz/regexp.dict
index 06b74a6ce..30d666dcf 100644
--- a/fuzz/regexp.dict
+++ b/fuzz/regexp.dict
@@ -3,9 +3,19 @@ quant_opt="?"
 quant_some="+"
 quant_num="{1,2}"
 
+dot="."
 branch="|a"
+parens="()"
+parens_inner=")("
 pos_group="[a]"
 neg_group="[^a]"
+class_subtraction="[a-[b]]"
+
+esc_space="\\s"
+esc_initial="\\i"
+esc_name="\\c"
+esc_digit="\\d"
+esc_word="\\w"
 
 cat_letter="\\p{L}"
 cat_mark="\\p{M}"
@@ -14,3 +24,132 @@ cat_punct="\\p{P}"
 cat_sym="\\p{S}"
 cat_sep="\\p{Z}"
 cat_other="\\p{C}"
+
+block_aegean_numbers="\\p{IsAegeanNumbers}"
+block_alphabetic_presentation_forms="\\p{IsAlphabeticPresentationForms}"
+block_arabic="\\p{IsArabic}"
+block_arabic_presentation_forms_a="\\p{IsArabicPresentationFormsA}"
+block_arabic_presentation_forms_b="\\p{IsArabicPresentationFormsB}"
+block_armenian="\\p{IsArmenian}"
+block_arrows="\\p{IsArrows}"
+block_basic_latin="\\p{IsBasicLatin}"
+block_bengali="\\p{IsBengali}"
+block_block_elements="\\p{IsBlockElements}"
+block_bopomofo="\\p{IsBopomofo}"
+block_bopomofo_extended="\\p{IsBopomofoExtended}"
+block_box_drawing="\\p{IsBoxDrawing}"
+block_braille_patterns="\\p{IsBraillePatterns}"
+block_buhid="\\p{IsBuhid}"
+block_byzantine_musical_symbols="\\p{IsByzantineMusicalSymbols}"
+block_c_j_k_compatibility="\\p{IsCJKCompatibility}"
+block_c_j_k_compatibility_forms="\\p{IsCJKCompatibilityForms}"
+block_c_j_k_compatibility_ideographs="\\p{IsCJKCompatibilityIdeographs}"
+block_c_j_k_compatibility_ideographs_supplement="\\p{IsCJKCompatibilityIdeographsSupplement}"
+block_c_j_k_radicals_supplement="\\p{IsCJKRadicalsSupplement}"
+block_c_j_k_symbolsand_punctuation="\\p{IsCJKSymbolsandPunctuation}"
+block_c_j_k_unified_ideographs="\\p{IsCJKUnifiedIdeographs}"
+block_c_j_k_unified_ideographs_extension_a="\\p{IsCJKUnifiedIdeographsExtensionA}"
+block_cjk_unified_ideographs_extension_b="\\p{IsCJKUnifiedIdeographsExtensionB}"
+block_cherokee="\\p{IsCherokee}"
+block_combining_diacritical_marks="\\p{IsCombiningDiacriticalMarks}"
+block_combining_diacritical_marksfor_symbols="\\p{IsCombiningDiacriticalMarksforSymbols}"
+block_combining_half_marks="\\p{IsCombiningHalfMarks}"
+block_combining_marksfor_symbols="\\p{IsCombiningMarksforSymbols}"
+block_control_pictures="\\p{IsControlPictures}"
+block_currency_symbols="\\p{IsCurrencySymbols}"
+block_cypriot_syllabary="\\p{IsCypriotSyllabary}"
+block_cyrillic="\\p{IsCyrillic}"
+block_cyrillic_supplement="\\p{IsCyrillicSupplement}"
+block_deseret="\\p{IsDeseret}"
+block_devanagari="\\p{IsDevanagari}"
+block_dingbats="\\p{IsDingbats}"
+block_enclosed_alphanumerics="\\p{IsEnclosedAlphanumerics}"
+block_enclosed_cjk_lettersand_months="\\p{IsEnclosedCJKLettersandMonths}"
+block_ethiopic="\\p{IsEthiopic}"
+block_general_punctuation="\\p{IsGeneralPunctuation}"
+block_geometric_shapes="\\p{IsGeometricShapes}"
+block_georgian="\\p{IsGeorgian}"
+block_gothic="\\p{IsGothic}"
+block_greek="\\p{IsGreek}"
+block_greek_extended="\\p{IsGreekExtended}"
+block_greekand_coptic="\\p{IsGreekandCoptic}"
+block_gujarati="\\p{IsGujarati}"
+block_gurmukhi="\\p{IsGurmukhi}"
+block_halfwidthand_fullwidth_forms="\\p{IsHalfwidthandFullwidthForms}"
+block_hangul_compatibility_jamo="\\p{IsHangulCompatibilityJamo}"
+block_hangul_jamo="\\p{IsHangulJamo}"
+block_hangul_syllables="\\p{IsHangulSyllables}"
+block_hanunoo="\\p{IsHanunoo}"
+block_hebrew="\\p{IsHebrew}"
+block_high_private_use_surrogates="\\p{IsHighPrivateUseSurrogates}"
+block_high_surrogates="\\p{IsHighSurrogates}"
+block_hiragana="\\p{IsHiragana}"
+block_ipa_extensions="\\p{IsIPAExtensions}"
+block_ideographic_description_characters="\\p{IsIdeographicDescriptionCharacters}"
+block_kanbun="\\p{IsKanbun}"
+block_kangxi_radicals="\\p{IsKangxiRadicals}"
+block_kannada="\\p{IsKannada}"
+block_katakana="\\p{IsKatakana}"
+block_katakana_phonetic_extensions="\\p{IsKatakanaPhoneticExtensions}"
+block_khmer="\\p{IsKhmer}"
+block_khmer_symbols="\\p{IsKhmerSymbols}"
+block_lao="\\p{IsLao}"
+block_latin1Supplement="\\p{IsLatin1Supplement}"
+block_latin_extended_a="\\p{IsLatinExtendedA}"
+block_latin_extended_b="\\p{IsLatinExtendedB}"
+block_latin_extended_additional="\\p{IsLatinExtendedAdditional}"
+block_letterlike_symbols="\\p{IsLetterlikeSymbols}"
+block_limbu="\\p{IsLimbu}"
+block_linear_b_ideograms="\\p{IsLinearBIdeograms}"
+block_linear_b_syllabary="\\p{IsLinearBSyllabary}"
+block_low_surrogates="\\p{IsLowSurrogates}"
+block_malayalam="\\p{IsMalayalam}"
+block_mathematical_alphanumeric_symbols="\\p{IsMathematicalAlphanumericSymbols}"
+block_mathematical_operators="\\p{IsMathematicalOperators}"
+block_miscellaneous_mathematical_symbols_a="\\p{IsMiscellaneousMathematicalSymbolsA}"
+block_miscellaneous_mathematical_symbols_b="\\p{IsMiscellaneousMathematicalSymbolsB}"
+block_miscellaneous_symbols="\\p{IsMiscellaneousSymbols}"
+block_miscellaneous_symbolsand_arrows="\\p{IsMiscellaneousSymbolsandArrows}"
+block_miscellaneous_technical="\\p{IsMiscellaneousTechnical}"
+block_mongolian="\\p{IsMongolian}"
+block_musical_symbols="\\p{IsMusicalSymbols}"
+block_myanmar="\\p{IsMyanmar}"
+block_number_forms="\\p{IsNumberForms}"
+block_ogham="\\p{IsOgham}"
+block_old_italic="\\p{IsOldItalic}"
+block_optical_character_recognition="\\p{IsOpticalCharacterRecognition}"
+block_oriya="\\p{IsOriya}"
+block_osmanya="\\p{IsOsmanya}"
+block_phonetic_extensions="\\p{IsPhoneticExtensions}"
+block_private_use="\\p{IsPrivateUse}"
+block_private_use_area="\\p{IsPrivateUseArea}"
+block_runic="\\p{IsRunic}"
+block_shavian="\\p{IsShavian}"
+block_sinhala="\\p{IsSinhala}"
+block_small_form_variants="\\p{IsSmallFormVariants}"
+block_spacing_modifier_letters="\\p{IsSpacingModifierLetters}"
+block_specials="\\p{IsSpecials}"
+block_superscriptsand_subscripts="\\p{IsSuperscriptsandSubscripts}"
+block_supplemental_arrows_a="\\p{IsSupplementalArrowsA}"
+block_supplemental_arrows_b="\\p{IsSupplementalArrowsB}"
+block_supplemental_mathematical_operators="\\p{IsSupplementalMathematicalOperators}"
+block_supplementary_private_use_area_a="\\p{IsSupplementaryPrivateUseAreaA}"
+block_supplementary_private_use_area_b="\\p{IsSupplementaryPrivateUseAreaB}"
+block_syriac="\\p{IsSyriac}"
+block_tagalog="\\p{IsTagalog}"
+block_tagbanwa="\\p{IsTagbanwa}"
+block_tags="\\p{IsTags}"
+block_tai_le="\\p{IsTaiLe}"
+block_tai_xuan_jing_symbols="\\p{IsTaiXuanJingSymbols}"
+block_tamil="\\p{IsTamil}"
+block_telugu="\\p{IsTelugu}"
+block_thaana="\\p{IsThaana}"
+block_thai="\\p{IsThai}"
+block_tibetan="\\p{IsTibetan}"
+block_ugaritic="\\p{IsUgaritic}"
+block_unified_canadian_aboriginal_syllabics="\\p{IsUnifiedCanadianAboriginalSyllabics}"
+block_variation_selectors="\\p{IsVariationSelectors}"
+block_variation_selectors_supplement="\\p{IsVariationSelectorsSupplement}"
+block_yi_radicals="\\p{IsYiRadicals}"
+block_yi_syllables="\\p{IsYiSyllables}"
+block_yijing_hexagram_symbols="\\p{IsYijingHexagramSymbols}"
diff --git a/fuzz/regexp.options b/fuzz/regexp.options
new file mode 100644
index 000000000..09b9e6f07
--- /dev/null
+++ b/fuzz/regexp.options
@@ -0,0 +1,3 @@
+[libfuzzer]
+max_len = 200
+timeout = 5
diff --git a/fuzz/schema.options b/fuzz/schema.options
index 09f13d891..195ec544f 100644
--- a/fuzz/schema.options
+++ b/fuzz/schema.options
@@ -1,2 +1,3 @@
 [libfuzzer]
 max_len = 80000
+timeout = 20
diff --git a/fuzz/uri.options b/fuzz/uri.options
new file mode 100644
index 000000000..8c45a722e
--- /dev/null
+++ b/fuzz/uri.options
@@ -0,0 +1,3 @@
+[libfuzzer]
+max_len = 10000
+timeout = 5
diff --git a/fuzz/xml.options b/fuzz/xml.options
index 09f13d891..195ec544f 100644
--- a/fuzz/xml.options
+++ b/fuzz/xml.options
@@ -1,2 +1,3 @@
 [libfuzzer]
 max_len = 80000
+timeout = 20
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]