Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
32738fb
[WIP] eval: add unified Evaluator + EvalManager (no functional integr…
May 9, 2026
7218f5c
[WIP] eval: replace [eval] with [eval.<name>] sections in drive.ini
May 9, 2026
0360a83
[WIP] eval: rip out legacy eval functions, wire EvalManager into PuffeRL
May 9, 2026
8815315
[WIP] eval: explicit goal_advance_mode knob; replaces if-SIMULATION_R…
May 9, 2026
a6dd740
README: cluster section now framed for NYU specifically
May 9, 2026
170183b
ruff-format: collapse goal_advance_mode raise to one line
May 9, 2026
eb21afd
[WIP] eval: factor shared rollout loop into Evaluator base class
May 9, 2026
aeb9dd5
[WIP] eval: untrack accidentally-committed pycache; harden gitignore
May 9, 2026
3f28b08
tests: add 3-level inheritance + self-cycle cases for EvalManager parser
May 9, 2026
8b35194
[WIP] eval: render budget knob + random scenario selection per epoch
May 10, 2026
1537f56
[WIP] eval: subprocess evals see fresh checkpoint, not stale resume path
May 10, 2026
7af739a
[WIP] eval: revert goal_advance_mode C knob — defer to a separate PR
May 10, 2026
1153abe
[WIP] eval: tier-A tests — dispatch, info-shape, behavior cleanup, ov…
May 10, 2026
1cfbcc3
drive.ini: bump validation_gigaflow interval 25 → 250
May 10, 2026
c2fa176
[WIP] eval: log eval_seconds per evaluator
May 10, 2026
38a9c05
drive.py: fix missing goal_radius in resample-time binding.shared
May 10, 2026
6514e8d
drive.ini: merge validation_gigaflow + validation_gigaflow_render
May 10, 2026
2b07414
[WIP] eval: fix render-path vec.make signature for PufferEnv backend
May 10, 2026
9b7f9d6
[WIP] eval: validation_gigaflow runs 1 scenario per map, not 250
May 10, 2026
5b7997e
[WIP] eval: cap render clip length, decouple from metric-pass scenari…
May 10, 2026
ec0698a
[WIP] eval: bump render_max_steps default 91 → 300 (10s clip vs 3s)
May 10, 2026
cf955d9
[WIP] eval: default to PufferEnv inline, keep Multiprocessing opt-in
May 10, 2026
725475f
[WIP] eval: render returns only this-pass mp4s; nuplan paths use real…
May 10, 2026
38eec92
[WIP] eval: stamp global_step into render mp4 filenames
May 10, 2026
0e83577
[WIP] eval: lift render to base, behaviors now render
May 10, 2026
7253022
[WIP] eval: render output into per-evaluator subdir
May 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,10 @@ dmypy.json
checkpoints/
experiments/
benchmark*/
!pufferlib/ocean/benchmark/
!pufferlib/ocean/benchmark/**
# But re-ignore caches inside it
pufferlib/ocean/benchmark/**/__pycache__/
wandb/
.neptune/
raylib*/
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ python setup.py build_ext --inplace --force

## Install (HPC cluster)

For clusters where the host glibc is too old or you need a CUDA toolchain that's not pinned by the OS, PufferDrive uses a **mixed Singularity + venv** layout:
For the NYU cluster, PufferDrive recommends a **mixed Singularity + venv** layout:

- **Singularity image** (read-only, system-wide): supplies CUDA + cuDNN.
- **ext3 overlay** (writable via `--fakeroot`; hosts the miniforge3 base interpreter at `/ext3/miniforge3` only).
Expand Down
227 changes: 166 additions & 61 deletions pufferlib/config/ocean/drive.ini
Original file line number Diff line number Diff line change
Expand Up @@ -190,67 +190,172 @@ show_human_logs = True
; Options: List[str to path], str to path (e.g., "resources/drive/training/binaries/map_001.bin"), None
render_map = none

[eval]
; Set to True to enable periodic multi-scenario evaluation during training
multi_scenario_eval = False
; Set to True to enable periodic multi-scenario render during training (one
; rollout per scenario, output mp4 per scenario via the EGL render pipeline
; or HTML replay via viz.generate_interactive_replay depending on
; multi_scenario_render_backend). Does not affect multi_scenario_eval.
multi_scenario_render = True
; Epoch interval between render runs. Independent of eval_interval so metric
; eval can run on a tighter schedule than the more expensive render.
multi_scenario_render_interval = 250
; Render backend for multi_scenario_render: "html" (CPU, viz.generate_interactive_replay)
; or "egl" (C-side render.h → EGL → PBO → ffmpeg libx264, one mp4 per scenario).
multi_scenario_render_backend = egl
; Frequency of evaluation during training (in epochs)
eval_interval = 25
; When True, inline eval zeroes road-segment dropout + perturbations and
; enforces red-light stops. Metrics then reflect performance under clean
; conditions. The live training policy is re-aligned to the clean env's
; obs shape via _swap_policy_obs_counts — safe because the GigaFlow
; encoder is count-invariant (shared MLP + max-pool over segments).
clean_eval = True
num_agents = 512
; Batch size for eval_multi_scenarios (number of scenarios per batch)
; Path to dataset used for evaluation
map_dir = "pufferlib/resources/drive/binaries/carla_py123d"
; Simulation mode for evaluation: "gigaflow" or "replay"
; gigaflow — procedurally spawn agents on CARLA towns (needs map-only .bin
; files in pufferlib/resources/drive/binaries/carla_py123d)
; replay — play logged trajectories from WOMD/nuPlan scenarios (needs
; trajectory-bearing .bin files in pufferlib/resources/drive/binaries/womd)
multi_scenario_simulation_mode = "gigaflow"
; Total number of scenarios to evaluate
multi_scenario_num_scenarios = 250
; Per-scenario step count for replay-mode eval (also used as resample_frequency).
; 91 = WOMD (9.1s @ 10Hz). 201 = nuPlan (20.1s @ 10Hz). Ignored for gigaflow
; mode, which always uses a hardcoded 3000-step procedural episode.
scenario_length = 201
; Cap the render rollout at this many steps.
render_max_steps = 201
backend = PufferEnv
; WOSAC (Waymo Open Sim Agents Challenge) evaluation settings
; If True, enables evaluation on realism metrics each time we save a checkpoint
wosac_realism_eval = False
wosac_num_rollouts = 32 ; Number of policy rollouts per scene
wosac_init_steps = 10 ; When to start the simulation
wosac_num_agents = 256 ; Total number of WOSAC agents to evaluate
wosac_control_mode = "control_wosac" ; Control the tracks to predict
wosac_init_mode = "create_all_valid" ; Initialize from the tracks to predict
wosac_goal_radius = 2.0 ; Can shrink goal radius for WOSAC evaluation
wosac_sanity_check = False
wosac_aggregate_results = True ; Only return aggregate results across all scenes
; If True, enable human replay evaluation (pair policy-controlled agent with human replays)
human_replay_eval = False
human_replay_control_mode = "control_sdc_only" ; Control only the self-driving car
human_replay_num_agents = 64 ; This equals the number of scenarios, since we control one agent in each
; Evaluating different driving behaviours learned by the policy
driving_behaviours_eval = True
driving_behaviours_eval_config = "pufferlib/config/ocean/driving_behaviours_eval.ini"
driving_behaviours_eval_interval = 250
render_driving_behaviours = True
; ===========================================================================
; Evaluation suites
;
; Each [eval.<name>] section is one Evaluator instance. EvalManager
; auto-discovers any section under [eval] that has a `type` field.
; Sections without a `type` field are templates — referenced from other
; sections via `inherits = "<template_name>"`.
;
; Field reference:
; type — registered evaluator class (multi_scenario, behavior_class,
; human_replay, wosac)
; enabled — true|false
; interval — epochs between runs (0 disables)
; mode — "inline" (block training) | "subprocess" (spawn process)
; inherits — pull defaults from another section, recursively
; clean — true → zero perturbations + dropout + enforce red lights
; render — true → capture mp4(s) during rollout
; render_views — list of camera views: sim_state, bev, topdown_sim, bev_all
; env.<key> — any [env] override (dotted key)
; eval.<key> — evaluator-specific knob (e.g. num_scenarios)
; vec.<key> — any [vec] override
; ===========================================================================

[eval.validation_gigaflow]
type = "multi_scenario"
enabled = true
interval = 250
mode = "inline"
clean = true
; One rollout per map (8 carla maps). C-side gigaflow eval cycles maps
; sequentially within one PufferEnv worker: env_count = min(ceil(num_agents
; / max_per_env), num_scenarios), so 8 internal envs (one per map) step
; in parallel via the batched C kernel — no multiprocessing needed.
; render_num_scenarios decouples the render budget so videos stay cheap.
render = true
render_views = ["sim_state", "bev"]
env.simulation_mode = "gigaflow"
env.map_dir = "pufferlib/resources/drive/binaries/carla_py123d"
env.num_maps = 8
env.num_agents = 400
env.min_agents_per_env = 50
env.max_agents_per_env = 50
env.scenario_length = 3000
env.resample_frequency = 3000
eval.num_scenarios = 8
eval.render_num_scenarios = 5
; ~10 sec mp4 per scenario at 30 fps. Render is ~3 fps wall-clock at 1080p,
; so 300 steps × 5 scenarios × 2 views ≈ 17 min — below the eval cadence.
eval.render_max_steps = 300

; ---------------------------------------------------------------------------
; Driving-behaviour evaluation: nuPlan scenes labeled by scene type. Each
; behavior is one [eval.behaviors_*] section. All inherit from the template
; below — change shared knobs in one place.
; ---------------------------------------------------------------------------

[eval.behaviors_defaults]
; Template — no `type`, never instantiated directly. Other sections inherit.
enabled = false
interval = 250
mode = "inline"
clean = true
render = true
render_views = ["sim_state", "bev"]
env.simulation_mode = "replay"
env.control_mode = "control_sdc_only"
env.init_mode = "create_all_valid"
env.scenario_length = 201
env.max_partner_observations = 32
eval.num_scenarios = 50
; Render budget per epoch (metrics still use the full num_scenarios).
; Defaults to min(num_scenarios, 3); pin lower if even 3 mp4s × 12 classes
; × 2 views is too much wandb traffic.
eval.render_num_scenarios = 2

[eval.behaviors_full_dir]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/nuplan_mini_train_bins"

[eval.behaviors_hard_stop]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/hard_stop"

[eval.behaviors_highway_straight]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/highway_straight"

[eval.behaviors_lane_change]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/lane_change"

[eval.behaviors_merge]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/merge"

[eval.behaviors_parked_cars]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/parked_cars"

[eval.behaviors_roundabout]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/roundabout"

[eval.behaviors_stopped_traffic]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/stopped_traffic"

[eval.behaviors_traffic_light_green]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_green"

[eval.behaviors_traffic_light_stop]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_stop"

[eval.behaviors_unprotected_left]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_left"

[eval.behaviors_unprotected_right]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_right"

; ---------------------------------------------------------------------------
; Optional: WOSAC realism eval. Off by default.
; ---------------------------------------------------------------------------

[eval.wosac]
type = "wosac"
enabled = false
interval = 500
mode = "subprocess"
clean = true
render = false
env.control_mode = "control_wosac"
env.init_mode = "create_all_valid"
env.init_steps = 10
env.goal_radius = 2.0
eval.wosac_num_rollouts = 32
eval.wosac_num_agents = 256
eval.wosac_sanity_check = false
eval.wosac_aggregate_results = true

; [sweep.train.learning_rate]
; distribution = log_normal
Expand Down
64 changes: 0 additions & 64 deletions pufferlib/config/ocean/driving_behaviours_eval.ini

This file was deleted.

33 changes: 33 additions & 0 deletions pufferlib/ocean/benchmark/evaluators/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Unified evaluator framework for PufferDrive.

Each Evaluator subclass owns one rollout pattern. The EvalManager (parent
package) discovers evaluators from `[eval.<name>]` sections in drive.ini
and dispatches them inline (during training) or as subprocesses.

See docs/eval_unification.md for the full design rationale.
"""

from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator
from pufferlib.ocean.benchmark.evaluators.behavior_class import BehaviorClassEvaluator
from pufferlib.ocean.benchmark.evaluators.human_replay import HumanReplayEvaluator
from pufferlib.ocean.benchmark.evaluators.multi_scenario import MultiScenarioEvaluator
from pufferlib.ocean.benchmark.evaluators.wosac import WOSACEvaluator

# Lookup table mapping each `[eval.<name>].type` string from drive.ini to
# the Evaluator subclass that implements it. The EvalManager consults this
# table when turning a discovered config section into a live evaluator.
EVALUATOR_REGISTRY = dict(
    multi_scenario=MultiScenarioEvaluator,
    behavior_class=BehaviorClassEvaluator,
    human_replay=HumanReplayEvaluator,
    wosac=WOSACEvaluator,
)

# Public API of this package: the registry, the base types, and every
# concrete evaluator class.
__all__ = [
    "EVALUATOR_REGISTRY",
    "EvalResult",
    "Evaluator",
    "MultiScenarioEvaluator",
    "BehaviorClassEvaluator",
    "HumanReplayEvaluator",
    "WOSACEvaluator",
]
Loading
Loading