Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
32738fb
[WIP] eval: add unified Evaluator + EvalManager (no functional integr…
May 9, 2026
7218f5c
[WIP] eval: replace [eval] with [eval.<name>] sections in drive.ini
May 9, 2026
0360a83
[WIP] eval: rip out legacy eval functions, wire EvalManager into PuffeRL
May 9, 2026
8815315
[WIP] eval: explicit goal_advance_mode knob; replaces if-SIMULATION_R…
May 9, 2026
a6dd740
README: cluster section now framed for NYU specifically
May 9, 2026
170183b
ruff-format: collapse goal_advance_mode raise to one line
May 9, 2026
eb21afd
[WIP] eval: factor shared rollout loop into Evaluator base class
May 9, 2026
aeb9dd5
[WIP] eval: untrack accidentally-committed pycache; harden gitignore
May 9, 2026
3f28b08
tests: add 3-level inheritance + self-cycle cases for EvalManager parser
May 9, 2026
8b35194
[WIP] eval: render budget knob + random scenario selection per epoch
May 10, 2026
1537f56
[WIP] eval: subprocess evals see fresh checkpoint, not stale resume path
May 10, 2026
7af739a
[WIP] eval: revert goal_advance_mode C knob — defer to a separate PR
May 10, 2026
1153abe
[WIP] eval: tier-A tests — dispatch, info-shape, behavior cleanup, ov…
May 10, 2026
1cfbcc3
drive.ini: bump validation_gigaflow interval 25 → 250
May 10, 2026
c2fa176
[WIP] eval: log eval_seconds per evaluator
May 10, 2026
38a9c05
drive.py: fix missing goal_radius in resample-time binding.shared
May 10, 2026
6514e8d
drive.ini: merge validation_gigaflow + validation_gigaflow_render
May 10, 2026
2b07414
[WIP] eval: fix render-path vec.make signature for PufferEnv backend
May 10, 2026
9b7f9d6
[WIP] eval: validation_gigaflow runs 1 scenario per map, not 250
May 10, 2026
5b7997e
[WIP] eval: cap render clip length, decouple from metric-pass scenari…
May 10, 2026
ec0698a
[WIP] eval: bump render_max_steps default 91 → 300 (10s clip vs 3s)
May 10, 2026
cf955d9
[WIP] eval: default to PufferEnv inline, keep Multiprocessing opt-in
May 10, 2026
725475f
[WIP] eval: render returns only this-pass mp4s; nuplan paths use real…
May 10, 2026
38eec92
[WIP] eval: stamp global_step into render mp4 filenames
May 10, 2026
0e83577
[WIP] eval: lift render to base, behaviors now render
May 10, 2026
7253022
[WIP] eval: render output into per-evaluator subdir
May 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,10 @@ dmypy.json
checkpoints/
experiments/
benchmark*/
!pufferlib/ocean/benchmark/
!pufferlib/ocean/benchmark/**
# But re-ignore caches inside it
pufferlib/ocean/benchmark/**/__pycache__/
wandb/
.neptune/
raylib*/
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ python setup.py build_ext --inplace --force

## Install (HPC cluster)

For clusters where the host glibc is too old or you need a CUDA toolchain that's not pinned by the OS, PufferDrive uses a **mixed Singularity + venv** layout:
For the NYU cluster, PufferDrive recommends a **mixed Singularity + venv** layout:

- **Singularity image** (read-only, system-wide): supplies CUDA + cuDNN.
- **ext3 overlay** (writable via `--fakeroot`; hosts the miniforge3 base interpreter at `/ext3/miniforge3` only).
Expand Down
227 changes: 166 additions & 61 deletions pufferlib/config/ocean/drive.ini
Original file line number Diff line number Diff line change
Expand Up @@ -190,67 +190,172 @@ show_human_logs = True
; Options: List[str to path], str to path (e.g., "resources/drive/training/binaries/map_001.bin"), None
render_map = none

[eval]
; Set to True to enable periodic multi-scenario evaluation during training
multi_scenario_eval = False
; Set to True to enable periodic multi-scenario render during training (one
; rollout per scenario, output mp4 per scenario via the EGL render pipeline
; or HTML replay via viz.generate_interactive_replay depending on
; multi_scenario_render_backend). Does not affect multi_scenario_eval.
multi_scenario_render = True
; Epoch interval between render runs. Independent of eval_interval so metric
; eval can run on a tighter schedule than the more expensive render.
multi_scenario_render_interval = 250
; Render backend for multi_scenario_render: "html" (CPU, viz.generate_interactive_replay)
; or "egl" (C-side render.h → EGL → PBO → ffmpeg libx264, one mp4 per scenario).
multi_scenario_render_backend = egl
; Frequency of evaluation during training (in epochs)
eval_interval = 25
; When True, inline eval zeroes road-segment dropout + perturbations and
; enforces red-light stops. Metrics then reflect performance under clean
; conditions. The live training policy is re-aligned to the clean env's
; obs shape via _swap_policy_obs_counts — safe because the GigaFlow
; encoder is count-invariant (shared MLP + max-pool over segments).
clean_eval = True
num_agents = 512
; Batch size for eval_multi_scenarios (number of scenarios per batch)
; Path to dataset used for evaluation
map_dir = "pufferlib/resources/drive/binaries/carla_py123d"
; Simulation mode for evaluation: "gigaflow" or "replay"
; gigaflow — procedurally spawn agents on CARLA towns (needs map-only .bin
; files in pufferlib/resources/drive/binaries/carla_py123d)
; replay — play logged trajectories from WOMD/nuPlan scenarios (needs
; trajectory-bearing .bin files in pufferlib/resources/drive/binaries/womd)
multi_scenario_simulation_mode = "gigaflow"
; Total number of scenarios to evaluate
multi_scenario_num_scenarios = 250
; Per-scenario step count for replay-mode eval (also used as resample_frequency).
; 91 = WOMD (9.1s @ 10Hz). 201 = nuPlan (20.1s @ 10Hz). Ignored for gigaflow
; mode, which always uses a hardcoded 3000-step procedural episode.
scenario_length = 201
; Cap the render rollout at this many steps.
render_max_steps = 201
backend = PufferEnv
; WOSAC (Waymo Open Sim Agents Challenge) evaluation settings
; If True, enables evaluation on realism metrics each time we save a checkpoint
wosac_realism_eval = False
wosac_num_rollouts = 32 ; Number of policy rollouts per scene
wosac_init_steps = 10 ; When to start the simulation
wosac_num_agents = 256 ; Total number of WOSAC agents to evaluate
wosac_control_mode = "control_wosac" ; Control the tracks to predict
wosac_init_mode = "create_all_valid" ; Initialize from the tracks to predict
wosac_goal_radius = 2.0 ; Can shrink goal radius for WOSAC evaluation
wosac_sanity_check = False
wosac_aggregate_results = True ; Only return aggregate results across all scenes
; If True, enable human replay evaluation (pair policy-controlled agent with human replays)
human_replay_eval = False
human_replay_control_mode = "control_sdc_only" ; Control only the self-driving car
human_replay_num_agents = 64 ; This equals the number of scenarios, since we control one agent in each
; Evaluating different driving behaviours learned by the policy
driving_behaviours_eval = True
driving_behaviours_eval_config = "pufferlib/config/ocean/driving_behaviours_eval.ini"
driving_behaviours_eval_interval = 250
render_driving_behaviours = True
; ===========================================================================
; Evaluation suites
;
; Each [eval.<name>] section is one Evaluator instance. EvalManager
; auto-discovers any section under [eval] that has a `type` field.
; Sections without a `type` field are templates — referenced from other
; sections via `inherits = "<template_name>"`.
;
; Field reference:
; type — registered evaluator class (multi_scenario, behavior_class,
; human_replay, wosac)
; enabled — true|false
; interval — epochs between runs (0 disables)
; mode — "inline" (block training) | "subprocess" (spawn process)
; inherits — pull defaults from another section, recursively
; clean — true → zero perturbations + dropout + enforce red lights
; render — true → capture mp4(s) during rollout
; render_views — list of camera views: sim_state, bev, topdown_sim, bev_all
; env.<key> — any [env] override (dotted key)
; eval.<key> — evaluator-specific knob (e.g. num_scenarios)
; vec.<key> — any [vec] override
; ===========================================================================

[eval.validation_gigaflow]
type = "multi_scenario"
enabled = true
interval = 250
mode = "inline"
clean = true
; One rollout per map (8 carla maps). C-side gigaflow eval cycles maps
; sequentially within one PufferEnv worker: env_count = min(ceil(num_agents
; / max_per_env), num_scenarios), so 8 internal envs (one per map) step
; in parallel via the batched C kernel — no multiprocessing needed.
; render_num_scenarios decouples the render budget so videos stay cheap.
render = true
render_views = ["sim_state", "bev"]
env.simulation_mode = "gigaflow"
env.map_dir = "pufferlib/resources/drive/binaries/carla_py123d"
env.num_maps = 8
env.num_agents = 400
env.min_agents_per_env = 50
env.max_agents_per_env = 50
env.scenario_length = 3000
env.resample_frequency = 3000
eval.num_scenarios = 8
eval.render_num_scenarios = 5
; ~10 sec mp4 per scenario at 30 fps. Render is ~3 fps wall-clock at 1080p,
; so 300 steps × 5 scenarios × 2 views ≈ 17 min — below the eval cadence.
eval.render_max_steps = 300

; ---------------------------------------------------------------------------
; Driving-behaviour evaluation: nuPlan scenes labeled by scene type. Each
; behavior is one [eval.behaviors_*] section. All inherit from the template
; below — change shared knobs in one place.
; ---------------------------------------------------------------------------

[eval.behaviors_defaults]
; Template — no `type`, never instantiated directly. Other sections inherit.
enabled = false
interval = 250
mode = "inline"
clean = true
render = true
render_views = ["sim_state", "bev"]
env.simulation_mode = "replay"
env.control_mode = "control_sdc_only"
env.init_mode = "create_all_valid"
env.scenario_length = 201
env.max_partner_observations = 32
eval.num_scenarios = 50
; Render budget per epoch (metrics still use the full num_scenarios).
; Defaults to min(num_scenarios, 3); pin lower if even 3 mp4s × 12 classes
; × 2 views is too much wandb traffic.
eval.render_num_scenarios = 2

[eval.behaviors_full_dir]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/nuplan_mini_train_bins"

[eval.behaviors_hard_stop]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/hard_stop"

[eval.behaviors_highway_straight]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/highway_straight"

[eval.behaviors_lane_change]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/lane_change"

[eval.behaviors_merge]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/merge"

[eval.behaviors_parked_cars]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/parked_cars"

[eval.behaviors_roundabout]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/roundabout"

[eval.behaviors_stopped_traffic]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/stopped_traffic"

[eval.behaviors_traffic_light_green]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_green"

[eval.behaviors_traffic_light_stop]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_stop"

[eval.behaviors_unprotected_left]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_left"

[eval.behaviors_unprotected_right]
inherits = "behaviors_defaults"
type = "behavior_class"
enabled = true
env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_right"

; ---------------------------------------------------------------------------
; Optional: WOSAC realism eval. Off by default.
; ---------------------------------------------------------------------------

[eval.wosac]
type = "wosac"
enabled = false
interval = 500
mode = "subprocess"
clean = true
render = false
env.control_mode = "control_wosac"
env.init_mode = "create_all_valid"
env.init_steps = 10
env.goal_radius = 2.0
eval.wosac_num_rollouts = 32
eval.wosac_num_agents = 256
eval.wosac_sanity_check = false
eval.wosac_aggregate_results = true

; [sweep.train.learning_rate]
; distribution = log_normal
Expand Down
64 changes: 0 additions & 64 deletions pufferlib/config/ocean/driving_behaviours_eval.ini

This file was deleted.

33 changes: 33 additions & 0 deletions pufferlib/ocean/benchmark/evaluators/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Unified evaluator framework for PufferDrive.

Each Evaluator subclass owns one rollout pattern. The EvalManager (parent
package) discovers evaluators from `[eval.<name>]` sections in drive.ini
and dispatches them inline (during training) or as subprocesses.

See docs/eval_unification.md for the full design rationale.
"""

from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator
from pufferlib.ocean.benchmark.evaluators.behavior_class import BehaviorClassEvaluator
from pufferlib.ocean.benchmark.evaluators.human_replay import HumanReplayEvaluator
from pufferlib.ocean.benchmark.evaluators.multi_scenario import MultiScenarioEvaluator
from pufferlib.ocean.benchmark.evaluators.wosac import WOSACEvaluator

# Lookup table mapping each `[eval.<name>].type` string from drive.ini to
# the Evaluator subclass that implements it. The EvalManager consults this
# table when turning a discovered config section into a live evaluator.
EVALUATOR_REGISTRY = dict(
    multi_scenario=MultiScenarioEvaluator,
    behavior_class=BehaviorClassEvaluator,
    human_replay=HumanReplayEvaluator,
    wosac=WOSACEvaluator,
)

# Public API of this package: the registry, the base types, and every
# concrete evaluator class.
__all__ = [
    "EVALUATOR_REGISTRY",
    "EvalResult",
    "Evaluator",
    "MultiScenarioEvaluator",
    "BehaviorClassEvaluator",
    "HumanReplayEvaluator",
    "WOSACEvaluator",
]
Loading
Loading