def get_random_seed_corpora_filestore_path():
    """Returns the filestore path under which the user-provided seed corpora
    are stored: the "random_seed_corpora" entry of the experiment filestore
    path."""
    experiment_filestore_path = get_experiment_filestore_path()
    return posixpath.join(experiment_filestore_path, 'random_seed_corpora')
# Maximum size, in bytes, of a single seed corpus file (limit chosen for AFL).
# Larger files are left out of the packed corpus.
CORPUS_ELEMENT_BYTES_LIMIT = 1 * 1024 * 1024


def _collect_valid_corpus_files(benchmark_corpus_dir):
    """Returns the sorted paths of all usable seed files found anywhere under
    |benchmark_corpus_dir|. A file is usable when it is a regular file that is
    non-empty and no larger than CORPUS_ELEMENT_BYTES_LIMIT."""
    valid_corpus_files = []
    for root, _, files in os.walk(benchmark_corpus_dir):
        for filename in files:
            file_path = os.path.join(root, filename)
            # os.walk also reports broken symlinks as files; getsize would
            # raise on those, so skip anything that is not a readable file.
            if not os.path.isfile(file_path):
                continue
            file_size = os.path.getsize(file_path)
            if 0 < file_size <= CORPUS_ELEMENT_BYTES_LIMIT:
                valid_corpus_files.append(file_path)
    # Sorted so that the resulting archive is reproducible.
    return sorted(valid_corpus_files)


# pylint: disable=too-many-locals
def validate_and_pack_random_seed_corpus(random_seed_corpus_dir, benchmarks):
    """Validates the seed corpus provided by the user and archives it.

    |random_seed_corpus_dir| must contain one non-empty directory per entry of
    |benchmarks|. For every benchmark, the usable seed files (non-empty and at
    most CORPUS_ELEMENT_BYTES_LIMIT bytes) are written to
    "<random_seed_corpus_dir>/<benchmark>.zip".

    Archive member names are the file paths relative to
    |random_seed_corpus_dir| (e.g. "<benchmark>/seed"), so files with the same
    name in different subdirectories do not overwrite each other.

    Raises:
        ValidationError: if |random_seed_corpus_dir| is not a directory, or if
            a benchmark's corpus directory is missing, is not a directory, is
            empty, or contains no usable seed file.
    """
    if not os.path.isdir(random_seed_corpus_dir):
        raise ValidationError('Corpus location "%s" is invalid.' %
                              random_seed_corpus_dir)

    for benchmark in benchmarks:
        benchmark_corpus_dir = os.path.join(random_seed_corpus_dir, benchmark)
        if not os.path.exists(benchmark_corpus_dir):
            raise ValidationError('Random seed corpus directory for '
                                  'benchmark "%s" does not exist.' % benchmark)
        if not os.path.isdir(benchmark_corpus_dir):
            raise ValidationError('Seed corpus of benchmark "%s" must be '
                                  'a directory.' % benchmark)
        if not os.listdir(benchmark_corpus_dir):
            raise ValidationError('Seed corpus of benchmark "%s" is empty.' %
                                  benchmark)

        valid_corpus_files = _collect_valid_corpus_files(benchmark_corpus_dir)
        if not valid_corpus_files:
            raise ValidationError('No valid corpus files for "%s"' % benchmark)

        benchmark_corpus_archive_path = os.path.join(random_seed_corpus_dir,
                                                     f'{benchmark}.zip')
        with zipfile.ZipFile(benchmark_corpus_archive_path, 'w') as archive:
            for file_path in valid_corpus_files:
                # Naming members relative to the corpus root keeps the
                # "<benchmark>/<file>" layout for top-level seeds while giving
                # nested seeds unique names.
                archive.write(
                    file_path,
                    os.path.relpath(file_path, random_seed_corpus_dir))
config['runner_memory'] = config.get('runner_memory', '12GB') + + config['random_seed_corpus_dir'] = random_seed_corpus_dir + if config['random_seed_corpus_dir']: + validate_and_pack_random_seed_corpus(config['random_seed_corpus_dir'], + benchmarks) + return start_experiment_from_full_config(config) @@ -332,6 +385,16 @@ def filter_file(tar_info): for benchmark in config['benchmarks']: add_oss_fuzz_corpus(benchmark, oss_fuzz_corpora_dir) + if config['random_seed_corpus_dir']: + for benchmark in config['benchmarks']: + benchmark_corpus_archive_path = os.path.join( + config['random_seed_corpus_dir'], f'{benchmark}.zip') + filestore_utils.cp( + benchmark_corpus_archive_path, + experiment_utils.get_random_seed_corpora_filestore_path() + '/', + recursive=True, + parallel=True) + class BaseDispatcher: """Class representing the dispatcher.""" @@ -524,6 +587,10 @@ def main(): '--runners-cpus', help='Cpus available to the runners.', required=False) + parser.add_argument('-rs', + '--random-seed-corpus-dir', + help='Path to the random seed corpus', + required=False) all_fuzzers = fuzzer_utils.get_fuzzer_names() parser.add_argument('-f', @@ -593,6 +660,14 @@ def main(): parser.error('The sum of runners and measurers cpus is greater than the' ' available cpu cores (%d)' % os.cpu_count()) + if args.random_seed_corpus_dir: + if args.no_seeds: + parser.error('Cannot enable options "random_seed_corpus_dir" and ' + '"no_seeds" at the same time') + if args.oss_fuzz_corpus: + parser.error('Cannot enable options "random_seed_corpus_dir" and ' + '"oss_fuzz_corpus" at the same time') + start_experiment(args.experiment_name, args.experiment_config, args.benchmarks, @@ -605,7 +680,8 @@ def main(): concurrent_builds=concurrent_builds, measurers_cpus=measurers_cpus, runners_cpus=runners_cpus, - use_branch_coverage=args.use_branch_coverage) + use_branch_coverage=args.use_branch_coverage, + random_seed_corpus_dir=args.random_seed_corpus_dir) return 0 diff --git a/experiment/runner.py 
def _unpack_random_seed_corpus(corpus_directory):
    """Replaces the contents of |corpus_directory| with one randomly picked
    input from the seed corpus provided by the user.

    The archive is looked up as "<BENCHMARK>.zip" (BENCHMARK is read from the
    environment) under the path returned by
    experiment_utils.get_random_seed_corpora_filestore_path(). The picked
    member is extracted under its archive name, so it may end up in a
    subdirectory of |corpus_directory|.

    NOTE(review): the archive is opened with zipfile as a local file; confirm
    that the filestore path is locally readable on the runner, or copy the
    archive locally first.

    Raises:
        IndexError: if the archive contains no file entries.
    """
    # Remove the initial seed corpus. The directory may not exist yet, in
    # which case there is nothing to remove.
    if os.path.isdir(corpus_directory):
        shutil.rmtree(corpus_directory)
    os.makedirs(corpus_directory, exist_ok=True)

    benchmark = environment.get('BENCHMARK')
    corpus_archive_filename = posixpath.join(
        experiment_utils.get_random_seed_corpora_filestore_path(),
        f'{benchmark}.zip')
    with zipfile.ZipFile(corpus_archive_filename) as zip_file:
        # Directory entries are not inputs; only pick among real files.
        candidates = [
            info for info in zip_file.infolist() if not info.is_dir()
        ]
        selected_file = random.choice(candidates)
        zip_file.extract(selected_file, corpus_directory)
Copied from unpack_seed_corpus in @@ -172,7 +187,10 @@ def run_fuzzer(max_total_time, log_filename): logs.error('Fuzz target binary not found.') return - _unpack_clusterfuzz_seed_corpus(target_binary, input_corpus) + if environment.get('RANDOM_SEED_CORPUS_DIR'): + _unpack_random_seed_corpus(input_corpus) + else: + _unpack_clusterfuzz_seed_corpus(target_binary, input_corpus) _clean_seed_corpus(input_corpus) if max_total_time is None: diff --git a/experiment/scheduler.py b/experiment/scheduler.py index 0f8946001..910dcba86 100644 --- a/experiment/scheduler.py +++ b/experiment/scheduler.py @@ -717,6 +717,7 @@ def render_startup_script_template(instance_name: str, fuzzer: str, 'oss_fuzz_corpus': experiment_config['oss_fuzz_corpus'], 'num_cpu_cores': experiment_config['runner_num_cpu_cores'], 'cpuset': CPUSET, + 'random_seed_corpus_dir': experiment_config['random_seed_corpus_dir'], } if not local_experiment: diff --git a/experiment/test_data/experiment-config.yaml b/experiment/test_data/experiment-config.yaml index 67f556bea..e99dc8978 100644 --- a/experiment/test_data/experiment-config.yaml +++ b/experiment/test_data/experiment-config.yaml @@ -31,6 +31,7 @@ git_hash: "git-hash" no_seeds: false no_dictionaries: false oss_fuzz_corpus: false +random_seed_corpus_dir: null description: "Test experiment" concurrent_builds: null runners_cpus: null diff --git a/experiment/test_run_experiment.py b/experiment/test_run_experiment.py index 47034fbf7..f40c44a85 100644 --- a/experiment/test_run_experiment.py +++ b/experiment/test_run_experiment.py @@ -202,6 +202,7 @@ def test_copy_resources_to_bucket(tmp_path): 'experiment': 'experiment', 'benchmarks': ['libxslt_xpath'], 'oss_fuzz_corpus': True, + 'random_seed_corpus_dir': None, } try: with mock.patch('common.filestore_utils.cp') as mocked_filestore_cp: diff --git a/experiment/test_scheduler.py b/experiment/test_scheduler.py index 02fbbef75..2598f0950 100644 --- a/experiment/test_scheduler.py +++ b/experiment/test_scheduler.py @@ 
-118,6 +118,7 @@ def test_create_trial_instance(benchmark, expected_image, expected_target, -e NO_SEEDS=False \\ -e NO_DICTIONARIES=False \\ -e OSS_FUZZ_CORPUS=False \\ +-e RANDOM_SEED_CORPUS_DIR=None \\ -e DOCKER_REGISTRY=gcr.io/fuzzbench -e CLOUD_PROJECT=fuzzbench -e CLOUD_COMPUTE_ZONE=us-central1-a \\ -e EXPERIMENT_FILESTORE=gs://experiment-data \\ -e REPORT_FILESTORE=gs://web-reports \\