From 32738fb5c2bfca6ede8d9e8c3e298a47262dfd6e Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sat, 9 May 2026 18:27:45 -0400
Subject: [PATCH 01/26] [WIP] eval: add unified Evaluator + EvalManager (no functional integration yet)

Adds the new evaluator framework as pure additions:

pufferlib/ocean/benchmark/evaluators/
    base.py            Evaluator + EvalResult dataclass
    multi_scenario.py  MultiScenarioEvaluator (replaces eval_multi_scenarios)
    human_replay.py    HumanReplayEvaluator (replay + control_sdc_only loop)
    behavior_class.py  BehaviorClassEvaluator (per-class nuPlan suite)
    wosac.py           Thin wrapper around the existing WOSACEvaluator

pufferlib/ocean/benchmark/manager.py
    EvalManager: section discovery, inheritance chain, clean-eval macro,
    dotted-key flattening, inline + subprocess dispatch, wandb logging.

Plus tests/test_eval_manager.py for the config-merge logic.

See docs/eval_unification.md for the full design rationale.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .gitignore                                     |   2 +
 .../ocean/benchmark/evaluators/__init__.py     |  33 +++
 pufferlib/ocean/benchmark/evaluators/base.py   |  76 +++++
 .../benchmark/evaluators/behavior_class.py     |  57 ++++
 .../benchmark/evaluators/human_replay.py       |  79 +++++
 .../benchmark/evaluators/multi_scenario.py     | 207 ++++++++++++++
 pufferlib/ocean/benchmark/evaluators/wosac.py  |  42 +++
 pufferlib/ocean/benchmark/manager.py           | 269 ++++++++++++++++++
 tests/test_eval_manager.py                     | 122 ++++++++
 9 files changed, 887 insertions(+)
 create mode 100644 pufferlib/ocean/benchmark/evaluators/__init__.py
 create mode 100644 pufferlib/ocean/benchmark/evaluators/base.py
 create mode 100644 pufferlib/ocean/benchmark/evaluators/behavior_class.py
 create mode 100644 pufferlib/ocean/benchmark/evaluators/human_replay.py
 create mode 100644 pufferlib/ocean/benchmark/evaluators/multi_scenario.py
 create mode 100644 pufferlib/ocean/benchmark/evaluators/wosac.py
 create mode 100644 pufferlib/ocean/benchmark/manager.py
 create mode 100644 tests/test_eval_manager.py

diff --git a/.gitignore b/.gitignore
index 4ca0ece3c..782cfdf36 100644
--- a/.gitignore
+++ b/.gitignore
@@ -148,6 +148,8 @@ dmypy.json
 checkpoints/
 experiments/
 benchmark*/
+!pufferlib/ocean/benchmark/
+!pufferlib/ocean/benchmark/**
 wandb/
 .neptune/
 raylib*/
diff --git a/pufferlib/ocean/benchmark/evaluators/__init__.py b/pufferlib/ocean/benchmark/evaluators/__init__.py
new file mode 100644
index 000000000..d7594bffc
--- /dev/null
+++ b/pufferlib/ocean/benchmark/evaluators/__init__.py
@@ -0,0 +1,33 @@
+"""Unified evaluator framework for PufferDrive.
+
+Each Evaluator subclass owns one rollout pattern. The EvalManager (parent
+package) discovers evaluators from `[eval.<name>]` sections in drive.ini
+and dispatches them inline (during training) or as subprocesses.
+
+See docs/eval_unification.md for the full design rationale.
+"""
+
+from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator
+from pufferlib.ocean.benchmark.evaluators.behavior_class import BehaviorClassEvaluator
+from pufferlib.ocean.benchmark.evaluators.human_replay import HumanReplayEvaluator
+from pufferlib.ocean.benchmark.evaluators.multi_scenario import MultiScenarioEvaluator
+from pufferlib.ocean.benchmark.evaluators.wosac import WOSACEvaluator
+
+# Type registry for [eval.<name>].type → class lookup. Manager uses this
+# to instantiate the right subclass per config section.
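+# For example, a drive.ini section [eval.validation_gigaflow] with
+# type = "multi_scenario" resolves here to MultiScenarioEvaluator; the
+# manager then constructs it with name="validation_gigaflow".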
+EVALUATOR_REGISTRY = {
+    "multi_scenario": MultiScenarioEvaluator,
+    "behavior_class": BehaviorClassEvaluator,
+    "human_replay": HumanReplayEvaluator,
+    "wosac": WOSACEvaluator,
+}
+
+__all__ = [
+    "EVALUATOR_REGISTRY",
+    "EvalResult",
+    "Evaluator",
+    "MultiScenarioEvaluator",
+    "BehaviorClassEvaluator",
+    "HumanReplayEvaluator",
+    "WOSACEvaluator",
+]
diff --git a/pufferlib/ocean/benchmark/evaluators/base.py b/pufferlib/ocean/benchmark/evaluators/base.py
new file mode 100644
index 000000000..066c1919d
--- /dev/null
+++ b/pufferlib/ocean/benchmark/evaluators/base.py
@@ -0,0 +1,76 @@
+"""Evaluator base class + EvalResult dataclass."""
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import ClassVar
+
+
+@dataclass
+class EvalResult:
+    metrics: dict
+    frames: list = field(default_factory=list)
+
+
+class Evaluator:
+    """Base class for all evaluators.
+
+    Subclasses set `type_name` (the value used in `[eval.<name>].type`) and
+    implement `rollout()`. Optionally override `env_overrides()`,
+    `vec_overrides()`, and `aggregate()`.
+    """
+
+    type_name: ClassVar[str] = ""
+
+    def __init__(self, name: str, config: dict, train_config: dict):
+        # `name` = the [eval.<name>] section name. Used as the wandb prefix.
+        self.name = name
+        # `config` = merged per-evaluator config (after inheritance + clean
+        # macro expansion). Has nested `env`, `vec`, plus flat scalar knobs.
+        self.config = config
+        # `train_config` = the full training config from drive.ini, used as
+        # the base layer that `config` overrides on top of.
+        self.train_config = train_config
+
+        # Common scalars pulled out for ergonomics.
+        self.enabled: bool = bool(config.get("enabled", True))
+        self.interval: int = int(config.get("interval", 0))
+        self.mode: str = config.get("mode", "inline")
+        self.render: bool = bool(config.get("render", False))
+        self.render_views: list = list(config.get("render_views", ["sim_state"]))
+        self.clean: bool = bool(config.get("clean", True))
+
+    def env_overrides(self) -> dict:
+        """Per-evaluator [env] overrides. Defaults to whatever the section
+        wrote under `env.*`. Subclasses can override to add baseline knobs."""
+        return dict(self.config.get("env", {}))
+
+    def vec_overrides(self) -> dict:
+        """Per-evaluator [vec] overrides. Default: serial single-worker —
+        the safe default for replay-style evals where each worker is a
+        single bin replay. Subclasses that want parallel throughput
+        (gigaflow validation) override this."""
+        base = {"backend": "PufferEnv", "num_envs": 1}
+        base.update(self.config.get("vec", {}))
+        return base
+
+    def rollout(self, vecenv, policy, args) -> EvalResult:
+        raise NotImplementedError
+
+    def aggregate(self, per_rollout: list) -> dict:
+        """Reduce a list of per-rollout dicts to a single metrics dict.
+
+        Default: numeric mean over keys present in any sub-dict. WOSAC
+        overrides for likelihood-style aggregation."""
+        import numpy as np
+
+        if not per_rollout:
+            return {}
+        keys = set()
+        for r in per_rollout:
+            keys.update(r.keys())
+        out = {}
+        for k in keys:
+            vals = [r[k] for r in per_rollout if k in r and isinstance(r[k], (int, float))]
+            if vals:
+                out[k] = float(np.mean(vals))
+        return out
diff --git a/pufferlib/ocean/benchmark/evaluators/behavior_class.py b/pufferlib/ocean/benchmark/evaluators/behavior_class.py
new file mode 100644
index 000000000..2d3a7f45c
--- /dev/null
+++ b/pufferlib/ocean/benchmark/evaluators/behavior_class.py
@@ -0,0 +1,57 @@
+"""BehaviorClassEvaluator — one nuPlan behavior category at a time.
+
+Runs a HumanReplayEvaluator-style rollout against a single map_dir, with
+optional fresh random sampling each pass when `num_scenarios` < total bins.
+"""
+
+import os
+import random
+import shutil
+import tempfile
+from typing import ClassVar
+
+from pufferlib.ocean.benchmark.evaluators.base import EvalResult
+from pufferlib.ocean.benchmark.evaluators.human_replay import HumanReplayEvaluator
+
+
+class BehaviorClassEvaluator(HumanReplayEvaluator):
+    type_name: ClassVar[str] = "behavior_class"
+
+    def __init__(self, name, config, train_config):
+        super().__init__(name, config, train_config)
+        self._sampled_dir = None  # tmp symlink dir created per pass
+
+    def env_overrides(self) -> dict:
+        # Reuse HumanReplay's defaults, then handle the random-sampling
+        # cap. If num_scenarios is smaller than total bins, build a tmp
+        # symlink dir with a fresh sample each pass and point map_dir there.
+        env = super().env_overrides()
+        map_dir = env.get("map_dir", "")
+        if not map_dir or not os.path.isdir(map_dir):
+            return env
+
+        num_scenarios = int(self.config.get("eval", {}).get("num_scenarios", 0))
+        all_bins = [f for f in os.listdir(map_dir) if f.endswith(".bin")]
+        if num_scenarios > 0 and num_scenarios < len(all_bins):
+            sampled = random.sample(all_bins, num_scenarios)
+            self._sampled_dir = tempfile.mkdtemp(prefix=f"{self.name}_")
+            for fname in sampled:
+                # Use an absolute target so the link stays valid even when
+                # map_dir is given relative to the launch directory.
+                os.symlink(os.path.abspath(os.path.join(map_dir, fname)), os.path.join(self._sampled_dir, fname))
+            env["map_dir"] = self._sampled_dir
+            env["num_agents"] = num_scenarios
+            env["num_maps"] = num_scenarios
+        else:
+            env["num_agents"] = len(all_bins)
+            env["num_maps"] = len(all_bins)
+        return env
+
+    def rollout(self, vecenv, policy, args) -> EvalResult:
+        result = super().rollout(vecenv, policy, args)
+        # Manager owns the cleanup window — defer rmtree until after vecenv.close
+        # so any open file descriptors on the symlinks are released first.
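+        # (Call order in EvalManager._run_one: rollout() → vecenv.close() in
+        # _run_inline's finally block → ev.cleanup(), so the sampled symlink
+        # dir outlives every open handle on it.)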
+ return result + + def cleanup(self): + if self._sampled_dir and os.path.isdir(self._sampled_dir): + shutil.rmtree(self._sampled_dir, ignore_errors=True) + self._sampled_dir = None diff --git a/pufferlib/ocean/benchmark/evaluators/human_replay.py b/pufferlib/ocean/benchmark/evaluators/human_replay.py new file mode 100644 index 000000000..28e19e22b --- /dev/null +++ b/pufferlib/ocean/benchmark/evaluators/human_replay.py @@ -0,0 +1,79 @@ +"""HumanReplayEvaluator — replay mode + control_sdc_only, one rollout per +bin in the map_dir, mean of per-episode info dicts.""" + +import os +from typing import ClassVar + +import numpy as np +import torch + +import pufferlib +from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator + + +class HumanReplayEvaluator(Evaluator): + type_name: ClassVar[str] = "human_replay" + + def env_overrides(self) -> dict: + env = { + "simulation_mode": "replay", + "control_mode": "control_sdc_only", + "init_mode": "create_all_valid", + "eval_mode": 1, + "termination_mode": 0, + "reward_randomization": False, + } + env.update(self.config.get("env", {})) + # num_agents = number of bins so each gets one episode slot + if "num_agents" not in env: + map_dir = env.get("map_dir", "") + if map_dir and os.path.isdir(map_dir): + env["num_agents"] = len([f for f in os.listdir(map_dir) if f.endswith(".bin")]) + env["num_maps"] = env["num_agents"] + return env + + def rollout(self, vecenv, policy, args) -> EvalResult: + device = args["train"]["device"] + scenario_length = int(args["env"]["scenario_length"]) + init_steps = int(args["env"].get("init_steps", 0)) + num_maps = int(args["env"]["num_maps"]) + num_agents = vecenv.observation_space.shape[0] + + # +1 step margin: env emits done on the step after scenario_length. + total_steps = (scenario_length - init_steps + 1) * num_maps + + obs, _ = vecenv.reset() + state = {} + if args["train"]["use_rnn"]: + state = dict( + lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), + lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), + ) + + all_infos = [] + for _ in range(total_steps): + with torch.no_grad(): + ob_t = torch.as_tensor(obs).to(device) + logits, _ = policy.forward_eval(ob_t, state) + action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) + action_np = action.cpu().numpy().reshape(vecenv.action_space.shape) + if isinstance(logits, torch.distributions.Normal): + action_np = np.clip(action_np, vecenv.action_space.low, vecenv.action_space.high) + obs, _, _, _, info_list = vecenv.step(action_np) + if info_list: + all_infos.extend(info_list) + # Stop once every bin has yielded one info to avoid double-counting + # on the second cycle through the dir. 
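+            # (Assumes each bin yields exactly one terminal info dict;
+            # env_overrides sizes num_agents to one episode slot per bin
+            # for exactly that reason.)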
+            if len(all_infos) >= num_maps:
+                break
+
+        if not all_infos:
+            return EvalResult(metrics={"num_scenarios_completed": 0})
+
+        metrics = {"num_scenarios_completed": float(len(all_infos))}
+        keys = set().union(*(d.keys() for d in all_infos))
+        for k in keys:
+            vals = [d[k] for d in all_infos if isinstance(d.get(k), (int, float))]
+            if vals:
+                metrics[k] = float(np.mean(vals))
+        return EvalResult(metrics=metrics, frames=[])
diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
new file mode 100644
index 000000000..365505bfd
--- /dev/null
+++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
@@ -0,0 +1,207 @@
+"""MultiScenarioEvaluator — distribute scenarios across workers, one rollout
+per scenario, mean per-scenario metrics."""
+
+import contextlib
+import os
+import time
+from pathlib import Path
+
+import numpy as np
+import torch
+import tqdm
+
+import pufferlib
+from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator
+
+
+class MultiScenarioEvaluator(Evaluator):
+    type_name = "multi_scenario"
+
+    def vec_overrides(self) -> dict:
+        # Multi-worker by default for throughput. Override via [eval.<name>.vec].
+        backend = self.train_config.get("vec", {}).get("backend", "PufferEnv")
+        num_envs = int(self.config.get("vec", {}).get("num_envs", 1))
+        return {"backend": backend, "num_envs": num_envs}
+
+    def env_overrides(self) -> dict:
+        # Sensible defaults for the gigaflow path; replay configs are expected
+        # to set the relevant knobs in [eval.<name>.env.*].
+        env = {
+            "eval_mode": 1,
+            "termination_mode": 0,
+            "reward_randomization": False,
+        }
+        env.update(self.config.get("env", {}))
+        return env
+
+    def rollout(self, vecenv, policy, args) -> EvalResult:
+        t0 = time.time()
+        num_scenarios = int(self.config.get("eval", {}).get("num_scenarios", 1))
+        scenario_length = int(args["env"].get("scenario_length", 91))
+        device = args["train"]["device"]
+        num_agents = vecenv.observation_space.shape[0]
+
+        global_infos = {}
+
+        # LSTM hidden state shared across the rollout; reset each scenario batch.
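+        # state stays {} for feed-forward policies; with use_rnn it carries
+        # lstm_h/lstm_c tensors of shape (num_agents, policy.hidden_size).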
+ state = {} + if args["train"]["use_rnn"]: + state = dict( + lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), + lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), + ) + + vecenv.async_reset(args.get("seed", 42)) + ob, _, _, _, infos, _, _ = vecenv.recv() + scenarios_processed = 0 + with tqdm.tqdm(total=num_scenarios, desc=f"[{self.name}] scenarios", disable=args.get("quiet", False)) as pbar: + while scenarios_processed < num_scenarios: + if args["train"]["use_rnn"]: + state["lstm_h"].zero_() + state["lstm_c"].zero_() + + for _ in range(scenario_length): + with torch.no_grad(): + ob_t = torch.as_tensor(ob).to(device) + logits, _ = policy.forward_eval(ob_t, state) + action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) + action = action.cpu().numpy().reshape(vecenv.action_space.shape) + if isinstance(logits, torch.distributions.Normal): + action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) + + ob, _, _, _, infos = vecenv.step(action) + + if infos and infos[0]: + for sub_env in infos: + for env_idx, summary in enumerate(sub_env): + map_name = summary["map_name"].split("/")[-1].split(".")[0] + summary["episode_id"] = env_idx + summary["map_name"] = map_name + scenarios_processed += 1 + pbar.update(1) + for k, v in summary.items(): + global_infos.setdefault(k, []).append(v) + + metrics = self._average(global_infos) + if not args.get("quiet", False): + print(f"[{self.name}] {scenarios_processed} scenarios in {time.time() - t0:.1f}s") + + frames = [] + if self.render: + frames = self._render_pass(vecenv, policy, args) + + return EvalResult(metrics=metrics, frames=frames) + + def _average(self, global_infos: dict) -> dict: + out = {} + import numbers + + for k, vs in global_infos.items(): + if k == "num_scenarios": + out[k] = float(np.sum(vs)) + elif vs and isinstance(vs[0], numbers.Number): + out[k] = float(np.mean(vs)) + return out + + def _render_pass(self, vecenv, policy, args) -> list: + """One rollout per view, all writing mp4s to a single dir. + + Re-uses the same vecenv if it's a single-worker setup; otherwise + delegates to a serial render env built fresh per view. + """ + import importlib + + env_name = args["env_name"] + backend = args.get("render_backend", "egl") + if backend != "egl": + return [] + + out_dir = Path(args.get("render_results_dir") or args.get("eval_results_dir") or ".") / "mp4" + out_dir.mkdir(parents=True, exist_ok=True) + + # Render with a fresh single-worker env so frame capture is sequential + # and starting_map_counter starts at 0. Multi-worker render doesn't + # match the C-side ffmpeg-per-env wiring cleanly. 
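+        # e.g. package="ocean" → importlib.import_module("pufferlib.ocean");
+        # any other package resolves to "pufferlib.environments.<package>".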
+ package = args.get("package", "ocean") + module_name = "pufferlib.ocean" if package == "ocean" else f"pufferlib.environments.{package}" + env_module = importlib.import_module(module_name) + make_env = env_module.env_creator(env_name) + + render_env_kwargs = dict(args["env"]) + render_env_kwargs["render_mode"] = "headless" + + all_paths = [] + for view in self.render_views: + view_idx = _VIEW_NAME_TO_IDX.get(view, 0) + view_suffix = "" if view == "sim_state" else f"_{view}" + + vec = pufferlib.vector.make( + [make_env], + env_args=[[]], + env_kwargs=[render_env_kwargs], + backend="PufferEnv", + num_envs=1, + num_workers=1, + batch_size=1, + ) + target = vec if not hasattr(vec, "envs") else vec.envs[0] + internal = getattr(target, "num_envs", 1) + for e in range(internal): + target.set_video_suffix(view_suffix, env_idx=e) + + paths = self._render_view(vec, target, policy, args, view_idx, out_dir) + vec.close() + all_paths.extend(paths) + return all_paths + + def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: Path) -> list: + device = args["train"]["device"] + num_agents = vecenv.observation_space.shape[0] + num_scenarios = int(self.config.get("eval", {}).get("num_scenarios", 1)) + max_steps = args.get("render_max_steps") or int(args["env"].get("scenario_length", 91)) + + saved_cwd = os.getcwd() + os.chdir(out_dir) + try: + state = {} + if args["train"]["use_rnn"]: + state = dict( + lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), + lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), + ) + scenarios_processed = 0 + while scenarios_processed < num_scenarios: + ob, _ = vecenv.reset() + scenarios = vecenv.get_state() + num_in_batch = len(scenarios) + remaining = num_scenarios - scenarios_processed - num_in_batch + target_env.batch_size_eval = max(1, remaining) + if args["train"]["use_rnn"]: + state["lstm_h"].zero_() + state["lstm_c"].zero_() + for _ in range(max_steps): + with torch.no_grad(): + ob_t = torch.as_tensor(ob).to(device) + logits, _ = policy.forward_eval(ob_t, state) + action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) + action = action.cpu().numpy().reshape(vecenv.action_space.shape) + if isinstance(logits, torch.distributions.Normal): + action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) + ob, _, _, _, _ = vecenv.step(action) + for e in range(num_in_batch): + target_env.render(env_idx=e, view_mode=view_idx) + for e in range(num_in_batch): + target_env.close_client(env_idx=e) + scenarios_processed += num_in_batch + finally: + os.chdir(saved_cwd) + + return sorted(p for p in out_dir.glob("*.mp4")) + + +_VIEW_NAME_TO_IDX = { + "sim_state": 0, + "bev": 1, + "topdown_sim": 2, + "bev_all": 3, +} diff --git a/pufferlib/ocean/benchmark/evaluators/wosac.py b/pufferlib/ocean/benchmark/evaluators/wosac.py new file mode 100644 index 000000000..8733c8a2a --- /dev/null +++ b/pufferlib/ocean/benchmark/evaluators/wosac.py @@ -0,0 +1,42 @@ +"""WOSACEvaluator — Waymo Open Sim Agents Challenge realism eval. + +Wraps the existing WOSACEvaluator class in benchmark/evaluator.py — that +file owns the realism math (per-feature likelihood under learned +estimators) and the per-scene multi-rollout structure. This adapter +fits it into the unified Evaluator interface. 
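+
+Enable it via the [eval.wosac] section in drive.ini (type = "wosac"); it
+ships disabled there and is typically dispatched in subprocess mode.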
+""" + +from typing import ClassVar + +from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator + + +class WOSACEvaluator(Evaluator): + type_name: ClassVar[str] = "wosac" + + def env_overrides(self) -> dict: + env = { + "control_mode": "control_wosac", + "init_mode": "create_all_valid", + "eval_mode": 1, + "termination_mode": 0, + "reward_randomization": False, + } + env.update(self.config.get("env", {})) + return env + + def rollout(self, vecenv, policy, args) -> EvalResult: + # Inner class pulls pandas/matplotlib — keep the import inside the + # rollout so the wrapper class can be imported in environments + # that don't have those (e.g. unit-test smoke envs). + from pufferlib.ocean.benchmark.evaluator import WOSACEvaluator as _WOSACInner + + inner = _WOSACInner(args) + df = inner.evaluate(args, vecenv, policy) + # df has one row per scene; aggregate to a single dict. + results = df.mean(numeric_only=True).to_dict() + results["total_num_agents"] = float(df["num_agents_per_scene"].sum()) + results["total_unique_scenarios"] = float(df.index.unique().shape[0]) + results["realism_meta_score_std"] = float(df["realism_meta_score"].std()) + results = {k: (float(v) if hasattr(v, "item") else v) for k, v in results.items()} + return EvalResult(metrics=results, frames=[]) diff --git a/pufferlib/ocean/benchmark/manager.py b/pufferlib/ocean/benchmark/manager.py new file mode 100644 index 000000000..86dedaacc --- /dev/null +++ b/pufferlib/ocean/benchmark/manager.py @@ -0,0 +1,269 @@ +"""EvalManager — discovers `[eval.]` sections, instantiates Evaluators, +dispatches them inline or as subprocesses, logs results. + +Config schema (see docs/eval_unification.md): + + [eval.] + type = "" + enabled = true|false + interval = + mode = "inline" | "subprocess" + inherits = "" # optional, recursive merge + clean = true|false + render = true|false + render_views = ["sim_state", ...] + env. = # any [env] override + eval. = # evaluator-specific knobs + vec. = # any [vec] override + +Sections without a `type` field are templates (only usable via `inherits`). +""" + +import copy +import importlib +import json +import os +import subprocess +import sys +import time +from pathlib import Path + +import pufferlib + +from pufferlib.ocean.benchmark.evaluators import EVALUATOR_REGISTRY, EvalResult, Evaluator + +# clean_eval macro — env knobs to zero/enforce. Per-section explicit values +# win over the macro (see _build_section_config). +CLEAN_EVAL_OVERRIDES = { + "lane_segment_dropout": 0.0, + "boundary_segment_dropout": 0.0, + "partner_blindness_prob": 0.0, + "phantom_braking_prob": 0.0, + "phantom_braking_trigger_prob": 0.0, + "traffic_light_behavior": 1, +} + + +class EvalManager: + def __init__(self, evaluators: list, train_config: dict): + self.evaluators = evaluators + self.train_config = train_config + + @classmethod + def from_config(cls, train_config: dict) -> "EvalManager": + sections = _discover_eval_sections(train_config) + evaluators = [] + for name, raw in sections.items(): + cfg = _build_section_config(name, raw, sections) + type_name = cfg.get("type") + if type_name is None: + # Template section — referenced via inherits but not instantiated. + continue + cls_for_type = EVALUATOR_REGISTRY.get(type_name) + if cls_for_type is None: + raise ValueError( + f"[eval.{name}] type='{type_name}' is not registered. 
" + f"Known types: {sorted(EVALUATOR_REGISTRY.keys())}" + ) + evaluators.append(cls_for_type(name=name, config=cfg, train_config=train_config)) + return cls(evaluators=evaluators, train_config=train_config) + + def maybe_run(self, epoch: int, policy, env_name: str, logger=None, global_step=None) -> dict: + """Called from the training loop. Runs every enabled evaluator + whose `interval` divides `epoch`. Returns a dict of {eval_name → metrics}.""" + results = {} + for ev in self.evaluators: + if not ev.enabled: + continue + if ev.interval <= 0: + continue + if epoch % ev.interval != 0: + continue + res = self._run_one(ev, policy=policy, env_name=env_name, logger=logger, global_step=global_step) + results[ev.name] = res + return results + + def run_one_by_name(self, name: str, policy, env_name: str, logger=None, global_step=None) -> EvalResult: + """Run a single named evaluator regardless of interval. Used for + the subprocess CLI entry and for standalone `puffer eval --evaluator `.""" + for ev in self.evaluators: + if ev.name == name: + return self._run_one(ev, policy=policy, env_name=env_name, logger=logger, global_step=global_step) + raise KeyError(f"No evaluator named '{name}'. Known: {[e.name for e in self.evaluators]}") + + def _run_one(self, ev: Evaluator, policy, env_name: str, logger, global_step) -> EvalResult: + if ev.mode == "subprocess": + res = self._run_subprocess(ev, env_name=env_name, global_step=global_step) + else: + res = self._run_inline(ev, policy=policy, env_name=env_name, global_step=global_step) + if logger is not None: + self._log(ev, res, logger=logger, global_step=global_step) + if hasattr(ev, "cleanup"): + ev.cleanup() + return res + + def _run_inline(self, ev: Evaluator, policy, env_name: str, global_step) -> EvalResult: + args = self._build_eval_args(ev, env_name=env_name, global_step=global_step) + + package = args.get("package", "ocean") + module_name = "pufferlib.ocean" if package == "ocean" else f"pufferlib.environments.{package}" + env_module = importlib.import_module(module_name) + make_env = env_module.env_creator(env_name) + + vec_kwargs = ev.vec_overrides() + num_envs = int(vec_kwargs.get("num_envs", 1)) + env_kwargs_list = [args["env"] for _ in range(num_envs)] + env_creators = [make_env] * num_envs + env_args_list = [[]] * num_envs + + vec_call_kwargs = dict(vec_kwargs) + vec_call_kwargs.setdefault("num_workers", num_envs) + vec_call_kwargs.setdefault("batch_size", num_envs) + + vecenv = pufferlib.vector.make( + env_creators, env_args=env_args_list, env_kwargs=env_kwargs_list, **vec_call_kwargs + ) + try: + res = ev.rollout(vecenv, policy, args) + finally: + vecenv.close() + return res + + def _run_subprocess(self, ev: Evaluator, env_name: str, global_step) -> EvalResult: + out_path = Path(self.train_config.get("data_dir", ".")) / "eval_subprocess_out" / f"{ev.name}.json" + out_path.parent.mkdir(parents=True, exist_ok=True) + cfg_path = out_path.with_suffix(".cfg.json") + with open(cfg_path, "w") as f: + json.dump({"name": ev.name, "global_step": global_step}, f) + + cmd = [ + sys.executable, + "-m", + "pufferlib.pufferl", + "eval", + env_name, + "--evaluator", + ev.name, + "--out", + str(out_path), + ] + # Subprocess inherits the same checkpoint via train_config.load_model_path. 
+        if self.train_config.get("load_model_path"):
+            cmd += ["--load-model-path", self.train_config["load_model_path"]]
+        subprocess.run(cmd, check=True)
+        with open(out_path) as f:
+            payload = json.load(f)
+        return EvalResult(metrics=payload.get("metrics", {}), frames=payload.get("frames", []))
+
+    def _build_eval_args(self, ev: Evaluator, env_name: str, global_step) -> dict:
+        args = copy.deepcopy(self.train_config)
+        args["env"].update(ev.env_overrides())
+        args.setdefault("vec", {})
+        args["vec"].update(ev.vec_overrides())
+        args["env_name"] = env_name
+        args["global_step"] = global_step
+        args["seed"] = int(self.train_config.get("train", {}).get("seed", 42)) or 42
+        # Pass through evaluator-private fields that subclasses look up on args.
+        ev_eval = ev.config.get("eval", {})
+        if ev_eval:
+            args.setdefault("eval", {})
+            args["eval"].update(ev_eval)
+        return args
+
+    def _log(self, ev: Evaluator, result: EvalResult, logger, global_step):
+        if not result.metrics and not result.frames:
+            return
+        log_dict = {f"{ev.name}/{k}": float(v) for k, v in result.metrics.items() if isinstance(v, (int, float))}
+        if hasattr(logger, "local_writer") and logger.local_writer and global_step is not None:
+            for k, v in log_dict.items():
+                logger.local_writer.add_scalar(k, v, global_step)
+        if hasattr(logger, "log") and log_dict:
+            if global_step is not None:
+                logger.log(log_dict, global_step)
+            else:
+                logger.log(log_dict)
+        if result.frames and hasattr(logger, "log"):
+            try:
+                import wandb
+
+                videos = [
+                    wandb.Video(str(p), fps=30, format="mp4", caption=Path(p).stem)
+                    for p in result.frames
+                    if str(p).endswith(".mp4")
+                ]
+                if videos:
+                    payload = {f"{ev.name}/render": videos if len(videos) > 1 else videos[0]}
+                    if global_step is not None:
+                        logger.log(payload, global_step)
+                    else:
+                        logger.log(payload)
+            except ImportError:
+                pass
+
+
+def _discover_eval_sections(args: dict) -> dict:
+    """Pull `[eval.<name>]` sections out of the parsed config.
+
+    `load_config` flattens dotted section names into a nested dict. So
+    `[eval.foo]` becomes `args["eval"]["foo"]`. We collect every direct
+    child of `args["eval"]` that's itself a dict and treat it as a section."""
+    eval_root = args.get("eval", {})
+    if not isinstance(eval_root, dict):
+        return {}
+    sections = {}
+    for name, body in eval_root.items():
+        if isinstance(body, dict):
+            sections[name] = body
+    return sections
+
+
+def _build_section_config(name: str, raw: dict, all_sections: dict) -> dict:
+    """Resolve `inherits` chain + `clean` macro + dotted-key flattening."""
+    chain = []
+    current_name = name
+    current_raw = raw
+    visited = set()
+    while True:
+        if current_name in visited:
+            raise ValueError(f"Cyclic 'inherits' chain involving [eval.{current_name}]")
+        visited.add(current_name)
+        chain.append(current_raw)
+        parent_name = current_raw.get("inherits")
+        if parent_name is None:
+            break
+        if parent_name not in all_sections:
+            raise ValueError(f"[eval.{current_name}].inherits='{parent_name}' is not a known section")
+        current_name = parent_name
+        current_raw = all_sections[parent_name]
+
+    merged = {}
+    for level in reversed(chain):
+        _deep_merge(merged, _expand_dotted(level))
+
+    if merged.get("clean", True):
+        env_section = merged.setdefault("env", {})
+        for k, v in CLEAN_EVAL_OVERRIDES.items():
+            env_section.setdefault(k, v)
+
+    return merged
+
+
+def _expand_dotted(raw: dict) -> dict:
+    """`{"env.simulation_mode": "replay"}` → `{"env": {"simulation_mode": "replay"}}`."""
+    out = {}
+    for k, v in raw.items():
+        if "." in k:
+            head, _, tail = k.partition(".")
+            sub = out.setdefault(head, {})
+            sub[tail] = v
+        else:
+            out[k] = v
+    return out
+
+
+def _deep_merge(dst: dict, src: dict):
+    for k, v in src.items():
+        if isinstance(v, dict) and isinstance(dst.get(k), dict):
+            _deep_merge(dst[k], v)
+        else:
+            dst[k] = v
diff --git a/tests/test_eval_manager.py b/tests/test_eval_manager.py
new file mode 100644
index 000000000..064463cc7
--- /dev/null
+++ b/tests/test_eval_manager.py
@@ -0,0 +1,122 @@
+"""Smoke tests for EvalManager config parsing.
+
+Doesn't load the full pufferl.py module (which pulls heavy training deps).
+Just verifies the inheritance + clean macro + dotted-key expansion logic
+behaves as the design doc says.
+"""
+
+import os
+import sys
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from pufferlib.ocean.benchmark.manager import (
+    CLEAN_EVAL_OVERRIDES,
+    EvalManager,
+    _build_section_config,
+    _expand_dotted,
+)
+
+
+def test_dotted_expand():
+    raw = {"env.simulation_mode": "replay", "interval": 25}
+    out = _expand_dotted(raw)
+    assert out == {"env": {"simulation_mode": "replay"}, "interval": 25}
+
+
+def test_inheritance_chain():
+    sections = {
+        "behaviors_defaults": {
+            "type": "behavior_class",
+            "interval": 250,
+            "env.simulation_mode": "replay",
+            "env.scenario_length": 201,
+        },
+        "behaviors_hard_stop": {
+            "inherits": "behaviors_defaults",
+            "env.map_dir": "/tmp/hard_stop",
+        },
+    }
+    cfg = _build_section_config("behaviors_hard_stop", sections["behaviors_hard_stop"], sections)
+    assert cfg["type"] == "behavior_class"
+    assert cfg["interval"] == 250
+    assert cfg["env"]["simulation_mode"] == "replay"
+    assert cfg["env"]["scenario_length"] == 201
+    assert cfg["env"]["map_dir"] == "/tmp/hard_stop"
+
+
+def test_inheritance_child_wins():
+    sections = {
+        "parent": {"interval": 250, "env.scenario_length": 201},
+        "child": {"inherits": "parent", "interval": 100, "env.scenario_length": 91},
+    }
+    cfg = _build_section_config("child", sections["child"], sections)
+    assert cfg["interval"] == 100
+    assert cfg["env"]["scenario_length"] == 91
+
+
+def test_inheritance_cycle_detected():
+    sections = {
+        "a": {"inherits": "b"},
+        "b": {"inherits": "a"},
+    }
+    with pytest.raises(ValueError, match="Cyclic"):
+        _build_section_config("a", sections["a"], sections)
+
+
+def test_inheritance_unknown_parent():
+    sections = {
+        "child": {"inherits": "nonexistent"},
+    }
+    with pytest.raises(ValueError, match="not a known section"):
+        _build_section_config("child", sections["child"], sections)
+
+
+def test_clean_macro_applied_by_default():
+    sections = {"foo": {"type": "multi_scenario"}}
+    cfg = _build_section_config("foo", sections["foo"], sections)
+    for k, v in CLEAN_EVAL_OVERRIDES.items():
+        assert cfg["env"][k] == v
+
+
+def test_clean_macro_disabled_when_clean_false():
+    sections = {"foo": {"type": "multi_scenario", "clean": False}}
+    cfg = _build_section_config("foo", sections["foo"], sections)
+    for k in CLEAN_EVAL_OVERRIDES:
+        assert k not in cfg.get("env", {})
+
+
+def test_clean_macro_loses_to_explicit_override():
+    sections = {
+        "foo": {
+            "type": "multi_scenario",
+            "env.lane_segment_dropout": 0.5,  # explicit > macro default of 0.0
+        }
+    }
+    cfg = _build_section_config("foo", sections["foo"], sections)
+    assert cfg["env"]["lane_segment_dropout"] == 0.5
+
+
+def test_manager_from_config_skips_template_sections():
+    train_config = {
+        "eval": {
+            "behaviors_defaults": {"interval": 250, "env.scenario_length": 201},
+            "behaviors_hard_stop": {
+                "type": "behavior_class",
+                "inherits": "behaviors_defaults",
+                "env.map_dir": "/tmp/hard_stop",
+            },
+        },
+    }
+    mgr = EvalManager.from_config(train_config)
+    names = [e.name for e in mgr.evaluators]
+    assert "behaviors_hard_stop" in names
+    assert "behaviors_defaults" not in names  # template, no `type` field
+
+
+def test_manager_unknown_type_raises():
+    train_config = {"eval": {"foo": {"type": "totally_made_up"}}}
+    with pytest.raises(ValueError, match="not registered"):
+        EvalManager.from_config(train_config)

From 7218f5c11e3fe700f9022502e7d2daecaa973224 Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sat, 9 May 2026 18:30:09 -0400
Subject: [PATCH 02/26] [WIP] eval: replace [eval] with [eval.<name>] sections in drive.ini
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Auto-discovered evaluator sections, schema A (dotted keys). Each
behavior class is its own section inheriting from the behaviors_defaults
template; gigaflow validation has separate metric-only and render
sections at different intervals.

driving_behaviours_eval.ini deleted — folded into drive.ini.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 pufferlib/config/ocean/drive.ini              | 221 +++++++++++++-----
 .../config/ocean/driving_behaviours_eval.ini  |  64 -----
 2 files changed, 160 insertions(+), 125 deletions(-)
 delete mode 100644 pufferlib/config/ocean/driving_behaviours_eval.ini

diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini
index bafc523b2..d6d62a025 100644
--- a/pufferlib/config/ocean/drive.ini
+++ b/pufferlib/config/ocean/drive.ini
@@ -190,67 +190,166 @@ show_human_logs = True
 ; Options: List[str to path], str to path (e.g., "resources/drive/training/binaries/map_001.bin"), None
 render_map = none
 
-[eval]
-; Set to True to enable periodic multi-scenario evaluation during training
-multi_scenario_eval = False
-; Set to True to enable periodic multi-scenario render during training (one
-; rollout per scenario, output mp4 per scenario via the EGL render pipeline
-; or HTML replay via viz.generate_interactive_replay depending on
-; multi_scenario_render_backend). Does not affect multi_scenario_eval.
-multi_scenario_render = True
-; Epoch interval between render runs. Independent of eval_interval so metric
-; eval can run on a tighter schedule than the more expensive render.
-multi_scenario_render_interval = 250
-; Render backend for multi_scenario_render: "html" (CPU, viz.generate_interactive_replay)
-; or "egl" (C-side render.h → EGL → PBO → ffmpeg libx264, one mp4 per scenario).
-multi_scenario_render_backend = egl
-; Frequency of evaluation during training (in epochs)
-eval_interval = 25
-; When True, inline eval zeroes road-segment dropout + perturbations and
-; enforces red-light stops. Metrics then reflect performance under clean
-; conditions. The live training policy is re-aligned to the clean env's
-; obs shape via _swap_policy_obs_counts — safe because the GigaFlow
-; encoder is count-invariant (shared MLP + max-pool over segments).
-clean_eval = True
-num_agents = 512
-; Batch size for eval_multi_scenarios (number of scenarios per batch)
-; Path to dataset used for evaluation
-map_dir = "pufferlib/resources/drive/binaries/carla_py123d"
-; Simulation mode for evaluation: "gigaflow" or "replay"
-;   gigaflow — procedurally spawn agents on CARLA towns (needs map-only .bin
-;     files in pufferlib/resources/drive/binaries/carla_py123d)
-;   replay — play logged trajectories from WOMD/nuPlan scenarios (needs
-;     trajectory-bearing .bin files in pufferlib/resources/drive/binaries/womd)
-multi_scenario_simulation_mode = "gigaflow"
-; Total number of scenarios to evaluate
-multi_scenario_num_scenarios = 250
-; Per-scenario step count for replay-mode eval (also used as resample_frequency).
-; 91 = WOMD (9.1s @ 10Hz). 201 = nuPlan (20.1s @ 10Hz). Ignored for gigaflow
-; mode, which always uses a hardcoded 3000-step procedural episode.
-scenario_length = 201
-; Cap the render rollout at this many steps.
-render_max_steps = 201
-backend = PufferEnv
-; WOSAC (Waymo Open Sim Agents Challenge) evaluation settings
-; If True, enables evaluation on realism metrics each time we save a checkpoint
-wosac_realism_eval = False
-wosac_num_rollouts = 32 ; Number of policy rollouts per scene
-wosac_init_steps = 10 ; When to start the simulation
-wosac_num_agents = 256 ; Total number of WOSAC agents to evaluate
-wosac_control_mode = "control_wosac" ; Control the tracks to predict
-wosac_init_mode = "create_all_valid" ; Initialize from the tracks to predict
-wosac_goal_radius = 2.0 ; Can shrink goal radius for WOSAC evaluation
-wosac_sanity_check = False
-wosac_aggregate_results = True ; Only return aggregate results across all scenes
-; If True, enable human replay evaluation (pair policy-controlled agent with human replays)
-human_replay_eval = False
-human_replay_control_mode = "control_sdc_only" ; Control only the self-driving car
-human_replay_num_agents = 64 ; This equals the number of scenarios, since we control one agent in each
-; Evaluating different driving behaviours learned by the policy
-driving_behaviours_eval = True
-driving_behaviours_eval_config = "pufferlib/config/ocean/driving_behaviours_eval.ini"
-driving_behaviours_eval_interval = 250
-render_driving_behaviours = True
+; ===========================================================================
+; Evaluation suites
+;
+; Each [eval.<name>] section is one Evaluator instance. EvalManager discovers
+; them automatically (any section under [eval]); those with a `type` field
+; are instantiated, and sections without a `type` field are templates —
+; referenced from other sections via `inherits = "<section>"`.
+;
+; Field reference:
+;   type         — registered evaluator class (multi_scenario, behavior_class,
+;                  human_replay, wosac)
+;   enabled      — true|false
+;   interval     — epochs between runs (0 disables)
+;   mode         — "inline" (block training) | "subprocess" (spawn process)
+;   inherits     — pull defaults from another section, recursively
+;   clean        — true → zero perturbations + dropout + enforce red lights
+;   render       — true → capture mp4(s) during rollout
+;   render_views — list of camera views: sim_state, bev, topdown_sim, bev_all
+;   env.<key>    — any [env] override (dotted key)
+;   eval.<key>   — evaluator-specific knob (e.g. num_scenarios)
+;   vec.<key>    — any [vec] override
+; ===========================================================================
+
+[eval.validation_gigaflow]
+type = "multi_scenario"
+enabled = true
+interval = 25
+mode = "inline"
+clean = true
+render = false
+env.simulation_mode = "gigaflow"
+env.map_dir = "pufferlib/resources/drive/binaries/carla_py123d"
+env.num_maps = 8
+env.num_agents = 512
+env.min_agents_per_env = 50
+env.max_agents_per_env = 50
+env.scenario_length = 3000
+env.resample_frequency = 3000
+eval.num_scenarios = 250
+
+[eval.validation_gigaflow_render]
+inherits = "validation_gigaflow"
+enabled = true
+interval = 250
+render = true
+render_views = ["sim_state", "bev"]
+eval.num_scenarios = 5
+
+; ---------------------------------------------------------------------------
+; Driving-behaviour evaluation: nuPlan scenes labeled by scene type. Each
+; behavior is one [eval.behaviors_*] section. All inherit from the template
+; below — change shared knobs in one place.
+; ---------------------------------------------------------------------------
+
+[eval.behaviors_defaults]
+; Template — no `type`, never instantiated directly. Other sections inherit.
+enabled = false
+interval = 250
+mode = "inline"
+clean = true
+render = true
+render_views = ["sim_state", "bev"]
+env.simulation_mode = "replay"
+env.control_mode = "control_sdc_only"
+env.init_mode = "create_all_valid"
+env.scenario_length = 201
+env.max_partner_observations = 32
+eval.num_scenarios = 50
+
+[eval.behaviors_full_dir]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/nuplan_mini_train_bins"
+
+[eval.behaviors_hard_stop]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/hard_stop"
+
+[eval.behaviors_highway_straight]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/highway_straight"
+
+[eval.behaviors_lane_change]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/lane_change"
+
+[eval.behaviors_merge]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/merge"
+
+[eval.behaviors_parked_cars]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/parked_cars"
+
+[eval.behaviors_roundabout]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/roundabout"
+
+[eval.behaviors_stopped_traffic]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/stopped_traffic"
+
+[eval.behaviors_traffic_light_green]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/traffic_light_green"
+
+[eval.behaviors_traffic_light_stop]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/traffic_light_stop"
+
+[eval.behaviors_unprotected_left]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/unprotected_left"
+
+[eval.behaviors_unprotected_right]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true +env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/unprotected_right" + +; --------------------------------------------------------------------------- +; Optional: WOSAC realism eval. Off by default. +; --------------------------------------------------------------------------- + +[eval.wosac] +type = "wosac" +enabled = false +interval = 500 +mode = "subprocess" +clean = true +render = false +env.control_mode = "control_wosac" +env.init_mode = "create_all_valid" +env.init_steps = 10 +env.goal_radius = 2.0 +eval.wosac_num_rollouts = 32 +eval.wosac_num_agents = 256 +eval.wosac_sanity_check = false +eval.wosac_aggregate_results = true ; [sweep.train.learning_rate] ; distribution = log_normal diff --git a/pufferlib/config/ocean/driving_behaviours_eval.ini b/pufferlib/config/ocean/driving_behaviours_eval.ini deleted file mode 100644 index 02896db99..000000000 --- a/pufferlib/config/ocean/driving_behaviours_eval.ini +++ /dev/null @@ -1,64 +0,0 @@ -; Configuration for driving behaviour evaluation using nuPlan mini-train -; scenarios labeled by scene type. Built from py123d 0.2.1 reconvert of -; .bin files under /scratch/ev2237/data/nuplan/categories_v021/. -; -; Eval runs in REPLAY mode (simulation_mode=replay, control_mode=control_sdc_only) -; using the same reward weights as training (no reward conditioning). Scenario -; length is 201 (nuPlan with duration_s=20 at 10Hz → 20.1s). -; -; Categories with an empty folder are omitted — driving_behaviours_eval errors -; if map_dir has no .bin files. Add new categories by labeling more scenes -; (see scripts/render_scenario.py --view bev) and copying them into the -; corresponding /scratch/ev2237/data/nuplan/categories_v021// folder. - -[eval_full_dir] -map_dir = "/scratch/ev2237/data/nuplan/nuplan_mini_train_bins" -scenario_length = 201 -; Random-sample this many bins from map_dir each eval pass (fresh sample -; per pass). Cap keeps wall-clock manageable; 876-bin full sweep would -; take ~25 min, 50 bins takes ~1.5 min. 
-num_scenarios = 50
-
-[eval_hard_stop]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/hard_stop"
-scenario_length = 201
-
-[eval_highway_straight]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/highway_straight"
-scenario_length = 201
-
-[eval_lane_change]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/lane_change"
-scenario_length = 201
-
-[eval_merge]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/merge"
-scenario_length = 201
-
-[eval_parked_cars]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/parked_cars"
-scenario_length = 201
-
-[eval_roundabout]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/roundabout"
-scenario_length = 201
-
-[eval_stopped_traffic]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/stopped_traffic"
-scenario_length = 201
-
-[eval_traffic_light_green]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_green"
-scenario_length = 201
-
-[eval_traffic_light_stop]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_stop"
-scenario_length = 201
-
-[eval_unprotected_left]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_left"
-scenario_length = 201
-
-[eval_unprotected_right]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_right"
-scenario_length = 201

From 0360a83f2429f5a6f7930e663fcb3249679aaa83 Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sat, 9 May 2026 18:30:23 -0400
Subject: [PATCH 03/26] [WIP] eval: rip out legacy eval functions, wire EvalManager into PuffeRL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removed from pufferl.py (~990 lines): eval(), eval_multi_scenarios(),
eval_multi_scenarios_render(), build_eval_overrides(),
load_eval_multi_scenarios_config(), _swap_policy_obs_counts(),
_render_driving_behaviours(), _export_metrics(), _log_eval_metrics(),
verify_scenario_coverage(), verify_scenario_coverage_gigaflow().
Plus the legacy eval block in PuffeRL.evaluate() and the
driving_behaviours_eval.ini loader in load_config.

Removed from utils.py (~300 lines): run_human_replay_eval_in_subprocess,
run_wosac_eval_in_subprocess, run_driving_behaviours_eval_in_subprocess.

PuffeRL.evaluate() now calls self._eval_manager.maybe_run() — single
unified path for all evals. main() wires `puffer eval --evaluator <name>
--out <path>` for both standalone and subprocess use.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 pufferlib/pufferl.py | 1355 +++---------------------------------------
 pufferlib/utils.py   |  303 ----------
 2 files changed, 80 insertions(+), 1578 deletions(-)

diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py
index d501a2bc4..7176c9759 100644
--- a/pufferlib/pufferl.py
+++ b/pufferlib/pufferl.py
@@ -7,8 +7,6 @@
 import numbers
 import warnings
 
-import pandas as pd
-
 warnings.filterwarnings("error", category=RuntimeWarning)
@@ -257,6 +255,8 @@ def __init__(self, config, vecenv, policy, logger=None):
         self.losses = {}
         self.best_score = -float("inf")
         self.ema_max = 0.0
+        # Set in train() via EvalManager.from_config (before evaluate() fires).
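+        # (Stays None for PuffeRL objects constructed outside train(); the
+        # maybe_run call in the train loop is guarded on that.)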
+        self._eval_manager = None
 
         # Dashboard
         self.model_size = sum(p.numel() for p in policy.parameters() if p.requires_grad)
@@ -457,198 +457,18 @@ def train(self):
             except Exception as e:
                 print(f"Failed to export model weights: {e}")
 
-        if self.config["eval"]["wosac_realism_eval"] and (
-            self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training
-        ):
-            pufferlib.utils.run_wosac_eval_in_subprocess(self.config, self.logger, self.global_step)
-
-        if self.config["eval"]["human_replay_eval"] and (
-            self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training
-        ):
-            pufferlib.utils.run_human_replay_eval_in_subprocess(self.config, self.logger, self.global_step)
-
-        if self.config["eval"]["wosac_realism_eval"] and (
-            self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training
-        ):
-            pufferlib.utils.run_wosac_eval_in_subprocess(self.config, self.logger, self.global_step)
-
-        if self.config["eval"]["human_replay_eval"] and (
-            self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training
-        ):
-            pufferlib.utils.run_human_replay_eval_in_subprocess(self.config, self.logger, self.global_step)
-
-        behaviours_eval_enabled = self.config["eval"].get("driving_behaviours_eval", False)
-        behaviours_eval_interval = int(self.config["eval"].get("driving_behaviours_eval_interval", 25))
-        behaviours_config = self.config.get("driving_behaviours_eval")
-        if (
-            behaviours_eval_enabled
-            and behaviours_config
-            and behaviours_eval_interval > 0
-            and (self.epoch % behaviours_eval_interval == 0 or done_training)
-        ):
-            self.save_checkpoint()
-            pufferlib.utils.run_driving_behaviours_eval_in_subprocess(
-                self.config, self.logger, self.global_step, behaviours_config
-            )
-            if self.config["eval"].get("render_driving_behaviours"):
-                self._render_driving_behaviours(behaviours_config)
-
-        if self.config["eval"]["multi_scenario_eval"] and (
-            self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training
-        ):
-            # Get evaluation settings from config
-            eval_simulation_mode = self.config["eval"]["multi_scenario_simulation_mode"]
-            num_agents_eval = self.config["eval"]["num_agents"]
-            map_dir = self.config["eval"]["map_dir"]
-
-            # Inline eval runs "clean" by default — perturbations + dropout off,
-            # red-light stops enforced — so the logged validation metrics
-            # track progress under controlled conditions rather than noisy
-            # training perturbations. The live training policy's road slicing
-            # is re-aligned to the clean env at eval time via
-            # _swap_policy_obs_counts inside eval_multi_scenarios.
-            clean_eval = self.config["eval"].get("clean_eval", True)
-            eval_overrides = build_eval_overrides(
-                simulation_mode=eval_simulation_mode,
-                num_agents=num_agents_eval,
-                num_scenarios=self.config["eval"]["multi_scenario_num_scenarios"],
-                map_dir=map_dir,
-                num_carla_maps=self.config["eval"].get("num_carla_maps", 8),
-                clean=clean_eval,
-                scenario_length=self.config["eval"].get("scenario_length"),
-            )
-
-            # Build eval args by applying overrides to training config
-            eval_args = load_eval_multi_scenarios_config(
+        # All evaluation is now driven by the unified EvalManager. Each
+        # [eval.<name>] section in drive.ini is one evaluator instance;
+        # the manager fires any whose interval divides this epoch. See
+        # docs/eval_unification.md for the design.
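+        # For example, [eval.validation_gigaflow] (interval = 25) fires at
+        # epochs 25, 50, 75, ... and logs under the "validation_gigaflow/"
+        # metric prefix; the [eval.behaviors_*] sections fire every 250 epochs.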
+ if self._eval_manager is not None: + self._eval_manager.maybe_run( + epoch=self.epoch, + policy=self.uncompiled_policy, env_name=self.config["env"], - model_path=None, # No saved model - using current policy in memory - eval_overrides=eval_overrides, - ) - # Add inline-specific settings - eval_args["global_step"] = self.global_step # Log by global step for TensorBoard - eval_args["num_scenarios"] = self.config["eval"]["multi_scenario_num_scenarios"] - eval_args["eval_simulation"] = eval_simulation_mode - - # Mark this as inline evaluation and set results folder in experiments - eval_args["inline_eval"] = True # Flag to indicate inline evaluation during training - experiment_name = f"{self.config['env']}_{self.logger.run_id}" - eval_args["load_model_path"] = os.path.join( - self.config["data_dir"], experiment_name, "models", f"inline_epoch_{self.epoch}.pt" + logger=self.logger, + global_step=self.global_step, ) - # For inline eval, results go in experiments folder instead of benchmark - eval_args["eval_results_dir"] = os.path.join( - self.config["data_dir"], - experiment_name, - "validation", - f"epoch_{self.epoch}", - self.config["eval"]["multi_scenario_simulation_mode"], - ) - - # Call eval_multi_scenarios inline with current policy and logger - print(f"\n🔄 Running multi-scenario evaluation at step {self.global_step}...") - eval_multi_scenarios( - env_name=self.config["env"], - args=eval_args, - vecenv=None, # Let it create its own eval environment - policy=self.uncompiled_policy, # Pass current policy - logger=self.logger, # Pass logger for TensorBoard logging - metric_prefix="validation", # Use validation_ prefix - quiet=True, # Suppress verbose output during inline eval - clean=clean_eval, - ) - - # Multi-scenario render — independent interval so the heavier render - # path doesn't have to fire every eval_interval. Mirrors the block - # above but calls eval_multi_scenarios_render with render=True and - # the configured backend ("egl" by default on this branch, writes - # one mp4 per scenario via the C render.h pipeline). 
- if self.config["eval"]["multi_scenario_render"] and ( - self.epoch % self.config["eval"]["multi_scenario_render_interval"] == 0 or done_training - ): - render_simulation_mode = self.config["eval"]["multi_scenario_simulation_mode"] - num_agents_render = self.config["eval"]["num_agents"] - render_map_dir = self.config["eval"]["map_dir"] - clean_render = self.config["eval"].get("clean_eval", True) - - render_overrides = build_eval_overrides( - simulation_mode=render_simulation_mode, - num_agents=num_agents_render, - num_scenarios=self.config["eval"]["multi_scenario_num_scenarios"], - map_dir=render_map_dir, - num_carla_maps=self.config["eval"].get("num_carla_maps", 8), - clean=clean_render, - scenario_length=self.config["eval"].get("scenario_length"), - ) - - render_args = load_eval_multi_scenarios_config( - env_name=self.config["env"], - model_path=None, - eval_overrides=render_overrides, - ) - render_args["global_step"] = self.global_step - render_args["num_scenarios"] = self.config["eval"]["multi_scenario_num_scenarios"] - render_args["eval_simulation"] = render_simulation_mode - render_args["render"] = True # master on/off for the render branch - render_args["render_obs"] = False # HTML-only; EGL path ignores - - render_args["inline_eval"] = True - experiment_name = f"{self.config['env']}_{self.logger.run_id}" - render_args["load_model_path"] = os.path.join( - self.config["data_dir"], experiment_name, "models", f"inline_epoch_{self.epoch}.pt" - ) - render_args["eval_results_dir"] = os.path.join( - self.config["data_dir"], - experiment_name, - "renders", - f"epoch_{self.epoch:08d}", - self.config["eval"]["multi_scenario_simulation_mode"], - ) - - backend_name = self.config["eval"]["multi_scenario_render_backend"] - print(f"\n🎬 Running multi-scenario {backend_name} render at step {self.global_step}...") - # Render failures (missing map dir, corrupted .bin files, ffmpeg - # absent, EGL unavailable, etc.) should NEVER crash training — the - # render is a logging side-channel. Catch any exception here, log - # it, and let training keep going. The upstream eval_multi_scenarios - # metric call is separate and already ran, so metric eval continues - # to work even if video rendering is broken. - # Multi-view EGL render: run the full render fn once per view - # (sim_state then bev). Each call creates a fresh vecenv that - # starts at scenario 0, runs all scenarios with one camera, and - # tears down. Doing both views in ONE rollout would not work - # because Drive.step's resample fires at the last step and - # advances starting_map_counter — a re-reset would replay the - # NEXT batch instead of the original one. - _bev_views = [(0, "", "sim_state"), (1, "_bev", "bev")] if backend_name == "egl" else [(0, "", "sim_state")] - for _vmode, _vsuffix, _vlabel in _bev_views: - try: - eval_multi_scenarios_render( - env_name=self.config["env"], - args=dict(render_args), - vecenv=None, - policy=self.uncompiled_policy, - logger=self.logger, - metric_prefix=f"render_{_vlabel}", - quiet=True, - render_backend=backend_name, - view_mode=_vmode, - video_suffix=_vsuffix, - log_view_label=_vlabel, - # Configurable cap: eval.render_max_steps. Default 50 until - # the mystery ~500-c_render-call abort is properly diagnosed. - # Set to 0/negative to disable the cap entirely. 
- render_max_steps=(self.config["eval"].get("render_max_steps", 50) or None), - clean=clean_render, - ) - except Exception as e: - import traceback - - print( - f"\n⚠️ multi_scenario_render failed (view={_vlabel}) at step {self.global_step}: " - f"{type(e).__name__}: {e}" - ) - traceback.print_exc() - print("Training continues.") return logs @@ -959,90 +779,6 @@ def mean_and_log(self): self.logger.log(logs, agent_steps) return logs - def _render_driving_behaviours(self, behaviours_config): - """Render one scenario per driving behaviour class using eval_multi_scenarios_render.""" - import random as _random - - EVAL_SECTIONS_PREFIX = "eval_" - backend_name = self.config["eval"].get("multi_scenario_render_backend", "egl") - bev_views = [(0, "", "sim_state"), (1, "_bev", "bev")] if backend_name == "egl" else [(0, "", "sim_state")] - - for class_name, class_cfg in behaviours_config.items(): - if not class_name.startswith(EVAL_SECTIONS_PREFIX): - continue - map_dir = class_cfg.get("map_dir", "") - if isinstance(map_dir, str): - map_dir = map_dir.strip('"').strip("'") - if not os.path.isdir(map_dir) or not any(f.endswith(".bin") for f in os.listdir(map_dir)): - continue - - short = class_name[len(EVAL_SECTIONS_PREFIX) :] - num_maps = len([f for f in os.listdir(map_dir) if f.endswith(".bin")]) - # Render under clean-eval conditions (zero dropout, zero - # perturbations, enforced red lights) so the mp4s show what - # the policy does under controlled eval, not the noisy - # training-time perturbations. Matches run_driving_behaviours - # _eval_in_subprocess, so the video matches the metric eval. - render_overrides = build_eval_overrides( - simulation_mode="replay", - num_agents=1, - num_scenarios=1, - map_dir=map_dir, - clean=True, - ) - render_overrides["env"]["control_mode"] = "control_sdc_only" - render_overrides["env"]["num_maps"] = num_maps - render_overrides["env"]["scenario_length"] = class_cfg.get("scenario_length", 91) - # Pick a random starting map index so each render epoch shows a - # different scenario from the directory. Without this, the env - # picks scenario 0 every time and we'd always render the same - # first .bin alphabetically. 
- render_overrides["env"]["starting_map"] = _random.randint(0, num_maps - 1) - - render_args = load_eval_multi_scenarios_config( - env_name=self.config["env"], - model_path=None, - eval_overrides=render_overrides, - ) - experiment_name = f"{self.config['env']}_{self.logger.run_id}" - render_args["global_step"] = self.global_step - render_args["num_scenarios"] = 1 - render_args["eval_simulation"] = "replay" - render_args["render"] = True - render_args["inline_eval"] = True - render_args["eval_results_dir"] = os.path.join( - self.config["data_dir"], - experiment_name, - "renders", - f"epoch_{self.epoch:08d}", - "driving_behaviours", - short, - ) - - for vmode, vsuffix, vlabel in bev_views: - try: - eval_multi_scenarios_render( - env_name=self.config["env"], - args=dict(render_args), - vecenv=None, - policy=self.uncompiled_policy, - logger=self.logger, - metric_prefix=f"driving_behaviours/{short}", - render_key_prefix=f"driving_behaviours/{short}/render/{vlabel}", - quiet=True, - render_backend=backend_name, - view_mode=vmode, - video_suffix=vsuffix, - log_view_label=vlabel, - render_max_steps=(self.config["eval"].get("render_max_steps", 50) or None), - clean=True, - ) - except Exception as e: - import traceback - - print(f"DrivingBehavioursRender [{short}] view={vlabel}: {type(e).__name__}: {e}") - traceback.print_exc() - def close(self): self.vecenv.close() self.utilization.stop() @@ -1635,10 +1371,13 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None, early_stop **args["train"], env=env_name, eval=args.get("eval", {}), - driving_behaviours_eval=args.get("driving_behaviours_eval"), ) pufferl = PuffeRL(train_config, vecenv, policy, logger) + from pufferlib.ocean.benchmark.manager import EvalManager + + pufferl._eval_manager = EvalManager.from_config(args) + # Restore optimizer state + step counters when resuming from a checkpoint. # save_checkpoint writes models/model__.pt and trainer_state.pt # (sibling of models/) — so trainer_state.pt is one dir above the .pt path. @@ -1723,994 +1462,66 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None, early_stop return all_logs -def eval(env_name, args=None, vecenv=None, policy=None): - """Evaluate a policy.""" - - args = args or load_config(env_name) - args["env"]["termination_mode"] = 0 - - wosac_enabled = args["eval"]["wosac_realism_eval"] - human_replay_enabled = args["eval"]["human_replay_eval"] - - if wosac_enabled: - args["env"]["map_dir"] = args["eval"]["map_dir"] - dataset_name = args["env"]["map_dir"].split("/")[-1] - - print(f"Running WOSAC realism evaluation with {dataset_name} dataset.\n") - from pufferlib.ocean.benchmark.evaluator import WOSACEvaluator - - backend = args["eval"]["backend"] - assert backend == "PufferEnv" or not wosac_enabled, "WOSAC evaluation only supports PufferEnv backend." 
- - # Configure environment for WOSAC - args["vec"] = dict(backend=backend, num_envs=1) - args["env"]["init_mode"] = args["eval"]["wosac_init_mode"] - args["env"]["control_mode"] = args["eval"]["wosac_control_mode"] - args["env"]["init_steps"] = args["eval"]["wosac_init_steps"] - args["env"]["goal_behavior"] = args["eval"]["wosac_goal_behavior"] - args["env"]["goal_radius"] = args["eval"]["wosac_goal_radius"] - - # Batch size configuration - num_scenes_per_batch = args["eval"]["wosac_batch_size"] - args["env"]["num_agents"] = num_scenes_per_batch * 10 - args["env"]["num_maps"] = args["eval"]["wosac_scenario_pool_size"] - - # Create environment and policy - vecenv = vecenv or load_env(env_name, args) - policy = policy or load_policy(args, vecenv, env_name) - - # Make eval class instance - evaluator = WOSACEvaluator(args) - - # Obtain scores - df_results = evaluator.evaluate(args, vecenv, policy) - - # Average results over scenarios - results_dict = df_results.mean().to_dict() - results_dict["total_num_agents"] = df_results["num_agents_per_scene"].sum() - results_dict["total_unique_scenarios"] = df_results.index.unique().shape[0] - results_dict["realism_meta_score_std"] = df_results["realism_meta_score"].std() - results_dict = {k: v.item() if hasattr(v, "item") else v for k, v in results_dict.items()} - - import json - - print("\nWOSAC_METRICS_START") - print(json.dumps(results_dict)) - print("WOSAC_METRICS_END") - vecenv.close() - return results_dict - - elif human_replay_enabled: - args["env"]["map_dir"] = args["eval"]["map_dir"] - dataset_name = args["env"]["map_dir"].split("/")[-1] - print(f"Running human replay evaluation with {dataset_name} dataset.\n") - from pufferlib.ocean.benchmark.evaluator import HumanReplayEvaluator - - backend = args["eval"].get("backend", "PufferEnv") - args["env"]["map_dir"] = args["eval"]["map_dir"] - args["env"]["num_agents"] = args["eval"]["human_replay_num_agents"] - args["env"]["num_maps"] = len([f for f in os.listdir(args["env"]["map_dir"]) if f.endswith(".bin")]) - - args["vec"] = dict(backend=backend, num_envs=1) - args["env"]["control_mode"] = args["eval"]["human_replay_control_mode"] - args["env"]["scenario_length"] = args["eval"].get("scenario_length", 201) - - vecenv = vecenv or load_env(env_name, args) - policy = policy or load_policy(args, vecenv, env_name) - - print(f"Effective number of scenarios used: {len(vecenv.driver_env.agent_offsets) - 1}") - - evaluator = HumanReplayEvaluator(args) - - # Run rollouts with human replays - results = evaluator.rollout(args, vecenv, policy) - - import json - - print("HUMAN_REPLAY_METRICS_START") - print(json.dumps(results)) - print("HUMAN_REPLAY_METRICS_END") - - return results - - else: # Standard evaluation: Render - backend = args["vec"]["backend"] - if backend != "PufferEnv": - backend = "Serial" - - args["vec"] = dict(backend=backend, num_envs=1) - vecenv = vecenv or load_env(env_name, args) - policy = policy or load_policy(args, vecenv, env_name) - - ob, info = vecenv.reset() - driver = vecenv.driver_env - num_agents = vecenv.observation_space.shape[0] - device = args["train"]["device"] - - state = {} - if args["train"]["use_rnn"]: - state = dict( - lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), - lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), - ) - - frames = [] - while True: - render = driver.render() - if len(frames) < args["save_frames"]: - frames.append(render) - - # Screenshot Ocean envs with F12, gifs with control + F12 - if driver.render_mode == "ansi": 
- print("\033[0;0H" + render + "\n") - time.sleep(1 / args["fps"]) - elif driver.render_mode == "rgb_array": - pass - # import cv2 - # render = cv2.cvtColor(render, cv2.COLOR_RGB2BGR) - # cv2.imshow('frame', render) - # cv2.waitKey(1) - # time.sleep(1/args['fps']) - - with torch.no_grad(): - ob = torch.as_tensor(ob).to(device) - logits, value = policy.forward_eval(ob, state) - action, logprob, _ = pufferlib.pytorch.sample_logits(logits) - action = action.cpu().numpy().reshape(vecenv.action_space.shape) - - if isinstance(logits, torch.distributions.Normal): - action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) - - ob = vecenv.step(action)[0] - - if len(frames) > 0 and len(frames) == args["save_frames"]: - import imageio - - imageio.mimsave(args["gif_path"], frames, fps=args["fps"], loop=0) - frames.append("Done") - - -def load_eval_multi_scenarios_config(env_name, model_path=None, eval_overrides=None): - """Load config for evaluation, merging experiment YAML with defaults.""" - args = load_config(env_name) - if model_path: - experiment_dir = os.path.dirname(os.path.dirname(model_path)) - config_yaml_path = os.path.join(experiment_dir, "config.yaml") - EXCLUDE_KEYS = eval_overrides["env"].keys() - # Override Policy and RNN dimensions from training config - if os.path.exists(config_yaml_path): - print(f"Found config.yaml at {config_yaml_path}. Merging with defaults...") - with open(config_yaml_path, "r") as f: - yaml_config = yaml.safe_load(f) - - for section in ["env", "policy", "rnn"]: - if section in yaml_config and isinstance(yaml_config[section], dict): - for k, v in yaml_config[section].items(): - if k not in EXCLUDE_KEYS: - args[section][k] = v - - # Also copy root-level keys like rnn_name, policy_name - for key in ["rnn_name", "policy_name"]: - if key in yaml_config: - args[key] = yaml_config[key] - - # Update use_rnn based on rnn_name - args["train"]["use_rnn"] = args["rnn_name"] is not None - - # Override env parameters from evaluation config - if eval_overrides: - for section, section_overrides in eval_overrides.items(): - if isinstance(section_overrides, dict): - for k, v in section_overrides.items(): - args[section][k] = v - else: - args[section] = section_overrides - - return args - - -def build_eval_overrides( - simulation_mode, num_agents, num_scenarios, map_dir=None, num_carla_maps=8, clean=False, scenario_length=None -): - """Build evaluation overrides for a given simulation mode. - - Args: - simulation_mode: "gigaflow" or "replay" - num_agents: agent slot budget for evaluation - map_dir: replay dataset directory, required for replay mode - clean: if True, run a "clean" eval — zero road-segment dropout and - enforce red-light stops. Only safe when the policy is rebuilt - from the eval env (standalone eval / render_scenario.py). Inline - eval during training reuses the live training policy, whose - encoder was built for the training obs shape; zeroing dropout - there changes the obs shape and triggers a CUDA device-side - assert. Perturbation probabilities (partner_blindness, - phantom_braking) are always forced to zero at eval — they're - pure randomness, they don't change the obs shape, and eval - should be deterministic regardless of clean mode. - scenario_length: replay-mode scenarios per-step count (also used as - resample_frequency). Defaults to 91 — WOMD's 9.1s @ 10Hz. nuPlan - scenes from the categorized py123d pipeline want 201 (20.1s). - Ignored in gigaflow mode (procedural episodes always run for the - hardcoded 3000-step budget). 
- """ - # Common reward coefficients (same for both modes) - common_env = { - "eval_mode": 1, - "collision_behavior": 1, - "offroad_behavior": 1, - "traffic_light_behavior": 1 if clean else 0, - "reward_randomization": False, - "reward_vehicle_collision": 3.0, - "reward_offroad_collision": 3.0, - "reward_ade": 0.0, - "reward_goal": 1.0, - "reward_overspeed": 0.05, - "reward_comfort": 0.05, - "reward_velocity": 0.0025, - "reward_lane_align": 0.025, - "reward_lane_center": 0.0038, - "reward_timestep": 0.000025, - # Always zero perturbations at eval. These don't change obs shape so - # it's safe to force even for inline eval, and a deterministic eval - # is what we want for tracking progress. - "partner_blindness_prob": 0.0, - "phantom_braking_prob": 0.0, - "phantom_braking_trigger_prob": 0.0, - } - - if clean: - # Dropout changes the obs shape. Only safe when the policy is - # rebuilt from the eval env (standalone eval / render_scenario). - # NEVER pass clean=True from an inline-eval call site — the live - # training policy's encoder was built for the training obs shape. - common_env["lane_segment_dropout"] = 0.0 - common_env["boundary_segment_dropout"] = 0.0 - - if simulation_mode == "gigaflow": - eval_overrides = { - "env": { - **common_env, - "simulation_mode": "gigaflow", - "min_agents_per_env": 50, - "max_agents_per_env": 50, - "resample_frequency": 3000, - "scenario_length": 3000, - # Point at the py123d-converted CARLA towns added to this branch. - # The older binaries/carla dir predates the 123Drive pipeline and - # is not populated on emerge/temp_training. - "map_dir": map_dir or "pufferlib/resources/drive/binaries/carla_py123d", - "num_maps": num_carla_maps, - "num_agents": num_agents, - "termination_mode": 0.0, - } - } - elif simulation_mode == "replay": - replay_len = scenario_length if scenario_length is not None else 91 - eval_overrides = { - "env": { - **common_env, - "simulation_mode": "replay", - "resample_frequency": replay_len, - "scenario_length": replay_len, - "max_agents_per_env": 64, - "map_dir": map_dir or "pufferlib/resources/drive/binaries/womd", - "num_maps": num_scenarios, - "num_agents": num_agents, - "min_agents_per_env": 1, - "termination_mode": 0.0, - # "control_mode": "control_sdc_only", - }, - } - else: - raise ValueError(f"Invalid simulation_mode: {simulation_mode}. Must be 'gigaflow' or 'replay'.") - - return eval_overrides - - -@contextlib.contextmanager -def _swap_policy_obs_counts(policy, vecenv): - """Temporarily align the policy's road-segment slicing with the eval env. - - Training uses dropout > 0 → smaller obs_{lane,boundary}_segment_count. - Clean eval uses dropout = 0 → larger counts, larger obs buffer. The - GigaFlow encoder (lane_encoder / boundary_encoder) is a shared MLP - applied per-segment with max-pool — its weights are count-invariant. - Only the obs-buffer slicing in DriveBackbone.forward depends on these - counts, so we can just swap them for the duration of the eval and the - same training policy works on the larger clean obs. - """ - try: - eval_env = vecenv.driver_env - new_lane = int(eval_env.obs_lane_segment_count) - new_boundary = int(eval_env.obs_boundary_segment_count) - except AttributeError: - # If the eval env doesn't expose these (unknown wrapper), skip the - # swap — forward will still work when training and eval obs shapes - # coincide (clean=False or no dropout configured). 
-        yield
-        return
-
-    targets = []
-    for m in policy.modules():
-        if hasattr(m, "obs_lane_segment_count") and hasattr(m, "obs_boundary_segment_count"):
-            targets.append(m)
-
-    saved = [(m.obs_lane_segment_count, m.obs_boundary_segment_count) for m in targets]
-    try:
-        for m in targets:
-            m.obs_lane_segment_count = new_lane
-            m.obs_boundary_segment_count = new_boundary
-        yield
-    finally:
-        for m, (orig_lane, orig_boundary) in zip(targets, saved):
-            m.obs_lane_segment_count = orig_lane
-            m.obs_boundary_segment_count = orig_boundary
-
-
-def verify_scenario_coverage(csv_path: str, num_scenarios: int) -> dict:
-    """
-    Verify that episode_metrics.csv contains all expected scenarios.
-
-    Args:
-        csv_path: Path to episode_metrics.csv
-        num_scenarios: Expected number of scenarios (e.g., 1000)
-
-    Returns:
-        dict with keys:
-        - complete: bool - True if all scenarios present
-        - expected_count: number of expected scenarios
-        - found_count: number of unique scenarios found
-        - missing: sorted list of missing map names
-        - extra: sorted list of unexpected map names
-        - duplicates: dict mapping map_name -> count (if >1)
-    """
-    df = pd.read_csv(csv_path)
-
-    # Expected: map_000, map_001, ..., map_{num_scenarios-1}
-    expected = {f"map_{i:03d}" for i in range(num_scenarios)}
-    found = set(df["map_name"].unique())
-
-    missing = expected - found
-    extra = found - expected
+def eval(env_name, args=None, vecenv=None, policy=None, evaluator_name=None, out_path=None):
+    """Run a single named evaluator from drive.ini.

-    # Check for duplicates
-    counts = df["map_name"].value_counts()
-    duplicates = {name: count for name, count in counts.items() if count > 1}
+    Standalone form: `puffer eval puffer_drive --evaluator <name>`. The
+    evaluator's config (env/vec overrides, render flag, etc.) comes from
+    the [eval.<name>] section. Loads the policy from `--load-model-path`.

-    complete = len(missing) == 0
-
-    return {
-        "complete": complete,
-        "expected_count": num_scenarios,
-        "found_count": len(found),
-        "missing": sorted(missing),
-        "extra": sorted(extra),
-        "duplicates": duplicates,
-    }
-
-
-def verify_scenario_coverage_gigaflow(csv_path: str, num_scenarios: int) -> dict:
-    """
-    Verify gigaflow evaluation CSV: maps repeat across scenarios, so check total
-    row count rather than unique map names.
+    Subprocess form: `--out <path>` writes the result dict to a JSON file
+    so the parent EvalManager can read structured metrics back without
+    parsing stdout.
""" - df = pd.read_csv(csv_path) - total_rows = len(df) - complete = total_rows == num_scenarios - return { - "complete": complete, - "expected_count": num_scenarios, - "found_count": total_rows, - } - - -# Helper functions for eval_multi_scenarios and eval_multi_scenarios_render -def _export_metrics(global_infos, eval_folder, num_scenarios, quiet, verify_coverage=False, simulation_mode="replay"): - """Export episode and summary CSVs, return avg_infos dict.""" - # Episode Metrics - try: - df_episodes = pd.DataFrame(global_infos) - first_cols = ["episode_id", "map_name"] - other_cols = [col for col in df_episodes.columns if col not in first_cols] - new_col_order = first_cols + other_cols - df_episodes = df_episodes[new_col_order] - - if verify_coverage: - df_episodes = df_episodes.sort_values(by=["map_name", "episode_id"]) - - episode_csv_path = os.path.join(eval_folder, "episode_metrics.csv") - df_episodes.to_csv(episode_csv_path, index=False) - if not quiet: - print(f"\n✅ Per-episode metrics exported to {episode_csv_path}") - - if verify_coverage: - if simulation_mode == "gigaflow": - result = verify_scenario_coverage_gigaflow(episode_csv_path, num_scenarios) - if not quiet: - if result["complete"]: - print(f"✅ All {num_scenarios} episodes present in CSV") - else: - print( - f"⚠️ Episode count mismatch: expected {result['expected_count']}, found {result['found_count']}" - ) - else: - result = verify_scenario_coverage(episode_csv_path, num_scenarios) - if not quiet: - if result["complete"]: - print(f"✅ All {num_scenarios} scenarios present in CSV") - else: - print(f"⚠️ Scenario coverage incomplete:") - print(f" Expected: {result['expected_count']}, Found: {result['found_count']}") - if result["missing"]: - print(f" Missing ({len(result['missing'])}): {result['missing']}") - if result["extra"]: - print(f" Extra: {result['extra'][:10]}...") - if result["duplicates"]: - print(f" Duplicates: {len(result['duplicates'])} scenarios have multiple entries") - for name, count in sorted(result["duplicates"].items()): - print(f" {name}: {count} entries") - except Exception as e: - print(f"\n⚠️ Could not export per-episode CSV. 
Error: {e}") - print("Global infos data:", global_infos) - - # Evaluation average metrics - avg_infos = {} - for k, v in global_infos.items(): - if k == "num_scenarios": - avg_infos[k] = np.sum(v) - elif v and isinstance(v[0], numbers.Number): - avg_infos[k] = np.mean(v) - df_summary = pd.DataFrame(list(avg_infos.items()), columns=["Metric", "Average"]) - summary_csv_path = os.path.join(eval_folder, "evaluation_summary.csv") - df_summary.to_csv(summary_csv_path, index=False) - if not quiet: - print(f"\n✅ Average results exported to {summary_csv_path}") - print(df_summary.to_string(index=False)) - - return avg_infos - - -def _log_eval_metrics(logger, avg_infos, args, metric_prefix, quiet): - """Log metrics to TensorBoard/wandb if logger is provided.""" - if logger is None or args.get("global_step") is None: - return - - global_step = args["global_step"] - - # Create log dict with metric prefix (use / for TensorBoard grouping) - log_dict = {} - for metric_key, metric_value in avg_infos.items(): - if isinstance(metric_value, (int, float)): - log_dict[f"{metric_prefix}/{metric_key}"] = float(metric_value) - - # Log to TensorBoard if available - if hasattr(logger, "local_writer") and logger.local_writer: - for key, value in log_dict.items(): - logger.local_writer.add_scalar(key, value, global_step) - if not quiet: - print(f"✅ Logged {len(log_dict)} validation metrics to TensorBoard at step {global_step}") - - # Also log to wandb/neptune if available - if hasattr(logger, "log"): - logger.log(log_dict, global_step) - - -def eval_multi_scenarios( - env_name, - args=None, - vecenv=None, - policy=None, - logger=None, - metric_prefix="validation", - quiet=False, - clean=False, -): - t0 = time.time() - - if args is None: - tmp_args = load_config(env_name) - model_path = tmp_args.get("load_model_path") - num_agents_eval = tmp_args["eval"]["num_agents"] - map_dir = tmp_args["eval"]["map_dir"] - - # CLI standalone entry point: read clean_eval from the eval section - # so users can enable it via --eval.clean-eval. Inline callers pass - # clean= directly and come in through the args-provided branch. 
-        clean_from_config = tmp_args["eval"].get("clean_eval", False)
-        eval_overrides = build_eval_overrides(
-            simulation_mode=tmp_args["eval_simulation"],
-            num_agents=num_agents_eval,
-            num_scenarios=tmp_args["num_scenarios"],
-            map_dir=map_dir,
-            num_carla_maps=tmp_args.get("num_carla_maps", 8),
-            clean=clean_from_config,
-            scenario_length=tmp_args["eval"].get("scenario_length"),
-        )
-        args = load_eval_multi_scenarios_config(env_name, model_path, eval_overrides)
-        clean = clean or clean_from_config
-
-    # Reproducibility — same approach as training
-    seed = args["train"]["seed"] or 42
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-
-    backend = args["vec"]["backend"]
-    num_scenarios = args["num_scenarios"]
-
-    num_workers = min(args["vec"]["num_envs"], num_scenarios)
-
-    # Distribute scenarios across workers
-    scenarios_per_worker = num_scenarios // num_workers
-    remainder = num_scenarios % num_workers
-    current_start = 0
-    env_kwargs_list = []
-    for j in range(num_workers):
-        worker_kwargs = copy.deepcopy(args["env"])
-        worker_num_scenario = scenarios_per_worker + (1 if j < remainder else 0)
-        worker_kwargs["starting_map"] = current_start
-        worker_kwargs["num_eval_scenarios"] = worker_num_scenario
-        env_kwargs_list.append(worker_kwargs)
-        current_start += worker_num_scenario
-
-    print(f"Distributing {num_scenarios} scenarios across {num_workers} workers:")
-    for j, w in enumerate(env_kwargs_list):
-        start = w["starting_map"]
-        count = w["num_eval_scenarios"]
-        print(f"  Worker {j}: maps {start}-{start + count - 1} ({count} scenarios)")
-
-    args["vec"] = dict(backend=backend, num_envs=num_workers, num_workers=num_workers, batch_size=num_workers)
-
-    if vecenv is None:
-        package = args["package"]
-        module_name = "pufferlib.ocean" if package == "ocean" else f"pufferlib.environments.{package}"
-        env_module = importlib.import_module(module_name)
-        make_env = env_module.env_creator(env_name)
-        # Pass as lists to preserve per-worker env_kwargs
-        env_creators = [make_env] * num_workers
-        env_args = [[]] * num_workers
-        vecenv = pufferlib.vector.make(env_creators, env_args=env_args, env_kwargs=env_kwargs_list, **args["vec"])
+    from pufferlib.ocean.benchmark.manager import EvalManager

-    policy = policy or load_policy(args, vecenv, env_name)
-    policy.eval()
-    num_agents = vecenv.observation_space.shape[0]
-    device = args["train"]["device"]
+    args = args or load_config(env_name)

-    state = {}
-    if args["train"]["use_rnn"]:
-        state = dict(
-            lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device),
-            lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device),
+    if evaluator_name is None:
+        evaluator_name = args.get("evaluator")
+    if evaluator_name is None:
+        raise pufferlib.APIUsageError(
+            "puffer eval requires --evaluator <name>; named [eval.<name>]
sections live in drive.ini" ) - # Folder for evaluation results - # For inline evaluation during training, use eval_results_dir in experiments folder - # For standalone evaluation, use benchmark folder - if "inline_eval" in args and args["inline_eval"] and "eval_results_dir" in args: - eval_folder = args["eval_results_dir"] - else: - # Standalone evaluation path (in benchmark folder) - model_path = args["load_model_path"] - if model_path is None: - eval_folder = os.path.join("benchmark", "no_policy", args["eval_simulation"]) - else: - model_filename_with_ext = os.path.basename(model_path) - model_name = os.path.splitext(model_filename_with_ext)[0] - models_dir = os.path.dirname(model_path) - experiment_dir = os.path.dirname(models_dir) - experiment_name = os.path.basename(experiment_dir) - eval_folder = os.path.join("benchmark", experiment_name, model_name, args["eval_simulation"]) - os.makedirs(eval_folder, exist_ok=True) - - global_infos = {} - scenarios_processed = 0 - vecenv.async_reset(42) - - ob, _, _, _, infos, _, _ = vecenv.recv() - # Clean eval may use different road-dropout than training. The shared - # training policy's obs slicing needs to be aligned with this env; see - # _swap_policy_obs_counts. - swap_ctx = _swap_policy_obs_counts(policy, vecenv) if clean else contextlib.nullcontext() - with swap_ctx, tqdm(total=num_scenarios, desc="Processing scenarios", disable=quiet) as pbar: - while scenarios_processed < num_scenarios: - # Reset LSTM - if args["train"]["use_rnn"]: - state = dict( - lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), - lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), - ) - - for _ in range(args["env"]["scenario_length"]): - with torch.no_grad(): - ob = torch.as_tensor(ob).to(device) - logits, _ = policy.forward_eval(ob, state) - action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) - action = action.cpu().numpy().reshape(vecenv.action_space.shape) - - if isinstance(logits, torch.distributions.Normal): - action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) - - ob, _, _, _, infos = vecenv.step(action) - - # Multi-worker backend returns infos as list of lists (one per worker) - if infos and infos[0]: - for sub_env in infos: - for env_idx, summary in enumerate(sub_env): - env_map_name = summary["map_name"].split("/")[-1].split(".")[0] - summary["episode_id"] = env_idx - summary["map_name"] = env_map_name - scenarios_processed += 1 - pbar.update(1) - - for k, v in summary.items(): - if k not in global_infos: - global_infos[k] = [] - global_infos[k].append(v) - - avg_infos = _export_metrics( - global_infos, - eval_folder, - num_scenarios, - quiet, - verify_coverage=True, - simulation_mode=args["env"]["simulation_mode"], + manager = EvalManager.from_config(args) + + # Build a fresh vecenv inside the manager via the evaluator's overrides. + # Policy can come from a checkpoint (load_model_path) or be passed in. + if policy is None: + # Need a probe vecenv just to construct the policy with the right + # obs/action spaces. Use the matching evaluator's env_overrides so + # the obs shape matches what the rollout will see. + target = next((e for e in manager.evaluators if e.name == evaluator_name), None) + if target is None: + raise KeyError(f"No [eval.{evaluator_name}] section found. 
Known: {[e.name for e in manager.evaluators]}") + probe_args = manager._build_eval_args(target, env_name=env_name, global_step=None) + probe_vec = load_env(env_name, probe_args) + policy = load_policy(probe_args, probe_vec, env_name) + probe_vec.close() + + result = manager.run_one_by_name( + evaluator_name, + policy=policy, + env_name=env_name, + logger=None, + global_step=args.get("global_step"), ) - print(f"\nTotal evaluation time: {time.time() - t0:.2f} seconds for {num_scenarios} scenarios.") - _log_eval_metrics(logger, avg_infos, args, metric_prefix, quiet) - - # Close vectorized environment to avoid file descriptor leaks - vecenv.close() - - -def eval_multi_scenarios_render( - env_name, - args=None, - vecenv=None, - policy=None, - logger=None, - metric_prefix="validation", - quiet=False, - render_backend="html", - view_mode=0, - video_suffix="", - log_view_label="render", - render_max_steps=None, - render_key_prefix=None, - clean=False, -): - # Set fixed seed for reproducible evaluation - np.random.seed(42) - torch.manual_seed(42) - - if args is None: - tmp_args = load_config(env_name) - model_path = tmp_args.get("load_model_path") - num_agents_eval = tmp_args["eval"]["num_agents"] - map_dir = tmp_args["eval"]["map_dir"] - clean_from_config = tmp_args["eval"].get("clean_eval", False) - eval_overrides = build_eval_overrides( - simulation_mode=tmp_args["eval_simulation"], - num_agents=num_agents_eval, - num_scenarios=tmp_args["num_scenarios"], - map_dir=map_dir, - num_carla_maps=tmp_args.get("num_carla_maps", 8), - clean=clean_from_config, - scenario_length=tmp_args["eval"].get("scenario_length"), - ) - args = load_eval_multi_scenarios_config(env_name, model_path, eval_overrides) - clean = clean or clean_from_config - - backend = args["vec"]["backend"] - if backend != "PufferEnv": - backend = "Serial" - - args["vec"] = dict(backend=backend, num_envs=1) - args["env"]["num_eval_scenarios"] = args["num_scenarios"] # first batch: fill as many scenarios as fit - - # Backend selection. - # "html" — the existing viz.generate_interactive_replay path (CPU-only, - # self-contained HTML per scenario). - # "egl" — the C-side render.h → make_client → client_record_frame - # pipeline (EGL GPU context, PBO double-buffer readback, - # writev → ffmpeg libx264, one mp4 per scenario). - egl_mode = bool(args.get("render")) and render_backend == "egl" - html_mode = bool(args.get("render")) and not egl_mode - if egl_mode: - # Force the C env to RENDER_HEADLESS so make_client spawns ffmpeg and - # (under DRIVE_HAS_EGL) switches the active GL context to the GPU. 
- args["env"]["render_mode"] = "headless" - - vecenv = vecenv or load_env(env_name, args) - - policy = policy or load_policy(args, vecenv, env_name) - policy.eval() - num_agents = vecenv.observation_space.shape[0] - device = args["train"]["device"] - - state = {} - if args["train"]["use_rnn"]: - state = dict( - lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), - lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), - ) - - # Folder for evaluation results - # For inline evaluation during training, use eval_results_dir in experiments folder - # For standalone evaluation, use benchmark folder - if "inline_eval" in args and args["inline_eval"] and "eval_results_dir" in args: - eval_folder = args["eval_results_dir"] - else: - # Standalone evaluation path (in benchmark folder) - model_path = args["load_model_path"] - if model_path is None: - eval_folder = os.path.join("benchmark", "no_policy", args["eval_simulation"]) - else: - model_filename_with_ext = os.path.basename(model_path) - model_name = os.path.splitext(model_filename_with_ext)[0] - models_dir = os.path.dirname(model_path) - experiment_dir = os.path.dirname(models_dir) - experiment_name = os.path.basename(experiment_dir) - eval_folder = os.path.join("benchmark", experiment_name, model_name, args["eval_simulation"]) - os.makedirs(eval_folder, exist_ok=True) - - saved_cwd = None - mp4_folder = None - gif_folder = None - if html_mode: - gif_folder = eval_folder + "/gif" - os.makedirs(gif_folder, exist_ok=True) - if egl_mode: - mp4_folder = os.path.join(eval_folder, "mp4") - os.makedirs(mp4_folder, exist_ok=True) - # C-side make_client writes .mp4 into the process cwd. We - # chdir into mp4_folder so every scenario's file lands in the right - # place, then restore cwd after the rollout loop. - saved_cwd = os.getcwd() - os.chdir(mp4_folder) - - global_infos = {} - num_scenarios = args["num_scenarios"] - - # Apply per-env video suffix once before any render. make_client reads - # env->video_suffix on the first render to build the ffmpeg filename, so - # this must fire before any step. We don't yet know how many internal - # envs are in the vecenv (vecenv.get_state() only works after reset), - # so set on a generous prefix and let extras be no-ops. - if egl_mode and video_suffix: - _target_env_pre = vecenv if not hasattr(vecenv, "envs") else vecenv.envs[0] - # Drive exposes its internal C-level vec env count via num_envs. - # Use it as the loop bound so we never call set_video_suffix on an - # out-of-range env_id (which would corrupt memory before the C - # bounds check landed). - _internal_num_envs = getattr(_target_env_pre, "num_envs", 1) - for _e in range(_internal_num_envs): - try: - _target_env_pre.set_video_suffix(video_suffix, env_idx=_e) - except Exception: - break - scenarios_processed = 0 - # PufferEnv native backend: vecenv IS the Drive env (no .envs list). - # Serial/Multiprocessing: need vecenv.envs[0] to reach the underlying env. - target_env = vecenv if not hasattr(vecenv, "envs") else vecenv.envs[0] - - # Align the live training policy's obs slicing with the (potentially - # clean) eval env for the render. Same swap as eval_multi_scenarios. 
- swap_ctx = _swap_policy_obs_counts(policy, vecenv) if clean else contextlib.nullcontext() - with swap_ctx, tqdm(total=num_scenarios, desc="Processing scenarios", disable=quiet) as pbar: - while scenarios_processed < num_scenarios: - ob, _ = vecenv.reset() - - # Get initial states for all environments in the batch - scenarios = vecenv.get_state() - num_envs_in_batch = len(scenarios) - batch_start = scenarios_processed - - # Prepare batch_size_eval for the resample that fires at end of the step loop. - # That resample will load the NEXT batch, so cap it at remaining_after_this. - remaining_after_this = num_scenarios - scenarios_processed - num_envs_in_batch - target_env.batch_size_eval = max(1, remaining_after_this) - - map_names = [] - for env_idx in range(num_envs_in_batch): - map_names.append(scenarios[env_idx]["map_name"].split("/")[-1].split(".")[0]) - - # Reset LSTM - if args["train"]["use_rnn"]: - state = dict( - lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), - lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), - ) - - # Initialize histories as lists of lists (one list per environment). - # Only needed for the HTML replay path — EGL writes mp4 frames - # directly to ffmpeg via c_render each step. - if html_mode: - agent_histories = [[] for _ in range(num_envs_in_batch)] - traffic_histories = [[] for _ in range(num_envs_in_batch)] - trajectory_histories = [[] for _ in range(num_envs_in_batch)] - all_agents_obs_histories = [[] for _ in range(num_envs_in_batch)] - - _render_steps = args["env"]["scenario_length"] - if render_max_steps is not None: - _render_steps = min(_render_steps, render_max_steps) - for t in range(_render_steps): - if html_mode: - current_scenarios = vecenv.get_state() - start_obs_index = 0 - - # Loop through every environment in the batch to record its history - for env_idx in range(num_envs_in_batch): - env_scenario = current_scenarios[env_idx] - - agent_histories[env_idx].append( - pufferlib.viz.fill_agents_state( - env_scenario, use_trajectory="trajectory" in args["env"]["action_type"] - ) - ) - traffic_histories[env_idx].append(pufferlib.viz.fill_traffics_state(env_scenario, t)) - - if "trajectory" in args["env"]["action_type"]: - trajectory_histories[env_idx].append(pufferlib.viz.fill_trajectories(env_scenario, t)) - - # Collect observation dictionaries for ALL active agents in THIS environment at timestep t - if args["render_obs"]: - step_obs_dict = {} - if env_idx > 0: - start_obs_index += current_scenarios[env_idx - 1]["active_agent_count"] - for agent_idx in range(env_scenario["active_agent_count"]): - agent_id = env_scenario["active_agent_indices"][agent_idx] - step_obs_dict[int(agent_id)] = pufferlib.viz.extract_obs_frame( - ob, - env_scenario, - args, - timestep=t, - obs_index=start_obs_index + agent_idx, - agent_idx=agent_idx, - head_north=True, - ) - all_agents_obs_histories[env_idx].append(step_obs_dict) - - with torch.no_grad(): - ob = torch.as_tensor(ob).to(device) - logits, _ = policy.forward_eval(ob, state) - action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) - action = action.cpu().numpy().reshape(vecenv.action_space.shape) + print("EVAL_RESULT_JSON_START") + import json - if isinstance(logits, torch.distributions.Normal): - action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) - - ob, _, _, _, infos = vecenv.step(action) - - if egl_mode: - # Flush one frame per env through c_render → client_record_frame - # → PBO async readback → writev → ffmpeg pipe. 
make_client is - # called lazily on the first render per env (sets up ffmpeg + - # GPU context) and close_client at scenario end flushes the - # trailing PBO frame. - for e in range(num_envs_in_batch): - target_env.render(env_idx=e, view_mode=view_mode) - - # Serial backend returns infos as single list (infos[0] is the env's info list) - if infos and infos[0]: - for env_idx, summary in enumerate(infos[0]): - env_map_name = summary["map_name"].split("/")[-1].split(".")[0] - summary["episode_id"] = batch_start + env_idx - summary["env_id"] = env_idx - summary["map_name"] = env_map_name - - for k, v in summary.items(): - if k not in global_infos: - global_infos[k] = [] - global_infos[k].append(v) - - if html_mode: - # Loop through every environment to generate its specific HTML replay - for env_idx in range(num_envs_in_batch): - global_episode_id = batch_start + env_idx - # Ensure we don't render padding environments if num_scenarios isn't perfectly divisible by batch_size - if global_episode_id >= num_scenarios: - break - env_map_name = map_names[env_idx] - - pufferlib.viz.generate_interactive_replay( - current_scenarios[env_idx], - agent_histories[env_idx], - traffic_histories[env_idx], - trajectory_histories[env_idx], - all_agents_obs_histories[env_idx], - f"{gif_folder}/{env_map_name}_{global_episode_id:03d}.html", - head_north=True, - ) + print(json.dumps({"name": evaluator_name, "metrics": result.metrics})) + print("EVAL_RESULT_JSON_END") - if egl_mode: - # Close every env's Client so ffmpeg gets EOF on its input pipe, - # the trailing PBO frame is flushed, and libx264 writes the mp4 - # trailer. Without this, the mp4 files are either empty or one - # frame short. - import sys as _sys_cc + if out_path: + with open(out_path, "w") as f: + json.dump( + {"name": evaluator_name, "metrics": result.metrics, "frames": [str(p) for p in result.frames]}, + f, + ) - _sys_cc.stderr.write( - f"[render-instr] starting close_client loop num_envs_in_batch={num_envs_in_batch}\n" - ) - _sys_cc.stderr.flush() - for e in range(num_envs_in_batch): - _sys_cc.stderr.write(f"[render-instr] close_client(env_idx={e}) calling\n") - _sys_cc.stderr.flush() - target_env.close_client(env_idx=e) - _sys_cc.stderr.write(f"[render-instr] close_client(env_idx={e}) returned\n") - _sys_cc.stderr.flush() - - scenarios_processed += num_envs_in_batch - pbar.update(num_envs_in_batch) - - import sys as _sys_instr - - _sys_instr.stderr.write("[render-instr] rollout loop done\n") - _sys_instr.stderr.flush() - - # render_key_prefix overrides metric_prefix for wandb media uploads only. - # This lets callers keep metric_prefix for scalar metrics while using a - # different namespace for renders (e.g. driving_behaviours//render/). - _upload_prefix = render_key_prefix if render_key_prefix is not None else metric_prefix - - if html_mode: - pufferlib.viz.build_gallery_index(gif_folder) - if logger is not None: - try: - import wandb - - html_paths = sorted(os.path.join(gif_folder, f) for f in os.listdir(gif_folder) if f.endswith(".html")) - if html_paths: - step = args.get("global_step") - # Stable key per (category, view); each render epoch overwrites - # the same wandb panel rather than fanning out by scenario UUID. 
- html_log = {_upload_prefix: wandb.Html(html_paths[-1])} - if hasattr(logger, "log"): - logger.log(html_log, step) if step is not None else logger.log(html_log) - if not quiet: - print(f"Uploaded {len(html_paths)} render HTML(s) to wandb") - except Exception as e: - if not quiet: - print(f"Failed to upload render HTMLs to wandb: {e}") - - if saved_cwd is not None: - os.chdir(saved_cwd) - _sys_instr.stderr.write("[render-instr] chdir restored\n") - _sys_instr.stderr.flush() - - avg_infos = _export_metrics(global_infos, eval_folder, num_scenarios, quiet, verify_coverage=False) - _sys_instr.stderr.write("[render-instr] _export_metrics done\n") - _sys_instr.stderr.flush() - _log_eval_metrics(logger, avg_infos, args, metric_prefix, quiet) - _sys_instr.stderr.write("[render-instr] _log_eval_metrics done\n") - _sys_instr.stderr.flush() - - if egl_mode and mp4_folder and logger is not None: - try: - import wandb - - mp4_paths = sorted(os.path.join(mp4_folder, f) for f in os.listdir(mp4_folder) if f.endswith(".mp4")) - if mp4_paths: - # Log under a single stable key per (category, view) so successive - # renders show up in the same wandb panel as a time series. - # The scenario UUID lives in the caption, not the key. - videos = [ - wandb.Video(p, fps=30, format="mp4", caption=os.path.splitext(os.path.basename(p))[0]) - for p in mp4_paths - ] - video_log = {_upload_prefix: videos if len(videos) > 1 else videos[0]} - step = args.get("global_step") - if hasattr(logger, "log"): - logger.log(video_log, step) if step is not None else logger.log(video_log) - if not quiet: - print(f"Uploaded {len(mp4_paths)} render mp4(s) to wandb") - except Exception as e: - if not quiet: - print(f"Failed to upload render mp4s to wandb: {e}") - - # Close vectorized environment to avoid file descriptor leaks - vecenv.close() + return result.metrics def sweep(args=None, env_name=None): @@ -3016,22 +1827,6 @@ def puffer_type(value): args["train"]["use_rnn"] = args["rnn_name"] is not None - # Load driving behaviours eval config if specified - behaviours_config_path = args.get("eval", {}).get("driving_behaviours_eval_config") - if behaviours_config_path: - if isinstance(behaviours_config_path, str): - behaviours_config_path = behaviours_config_path.strip('"').strip("'") - if os.path.exists(behaviours_config_path): - print(f"Loading driving behaviours eval config from {behaviours_config_path}") - bp = configparser.ConfigParser(inline_comment_prefixes=(";", "#")) - bp.read(behaviours_config_path) - behaviours = {} - for section in bp.sections(): - behaviours[section] = {k: puffer_type(v) for k, v in bp[section].items()} - args["driving_behaviours_eval"] = behaviours - else: - print(f"Warning: driving_behaviours_eval_config not found: {behaviours_config_path}") - # Use World size to divide Num_Agents / minibatch size in DDP if "LOCAL_RANK" in os.environ: world_size = int(os.environ.get("WORLD_SIZE", 1)) @@ -3053,12 +1848,22 @@ def main(): if mode == "train": train(env_name=env_name) elif mode == "eval": - eval(env_name=env_name) - elif mode == "eval_multi_scenarios": - eval_multi_scenarios(env_name=env_name) - elif mode == "eval_multi_scenarios_render": - eval_multi_scenarios_render(env_name=env_name) - print("") + # Pull --evaluator and --out from argv before load_config consumes them. 
+ evaluator_name = None + out_path = None + i = 0 + while i < len(sys.argv): + arg = sys.argv[i] + if arg == "--evaluator" and i + 1 < len(sys.argv): + evaluator_name = sys.argv[i + 1] + del sys.argv[i : i + 2] + continue + if arg == "--out" and i + 1 < len(sys.argv): + out_path = sys.argv[i + 1] + del sys.argv[i : i + 2] + continue + i += 1 + eval(env_name=env_name, evaluator_name=evaluator_name, out_path=out_path) elif mode == "sweep": sweep(env_name=env_name) elif mode == "controlled_exp": diff --git a/pufferlib/utils.py b/pufferlib/utils.py index a425dcef3..9efc0c628 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -8,309 +8,6 @@ import json -def run_human_replay_eval_in_subprocess(config, logger, global_step): - """ - Run human replay evaluation in a subprocess and log metrics to wandb. - - """ - try: - run_id = logger.run_id - model_dir = os.path.join(config["data_dir"], f"{config['env']}_{run_id}") - model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) - - if not model_files: - print("No model files found for human replay evaluation") - return - - latest_cpt = max(model_files, key=os.path.getctime) - - # Prepare evaluation command - eval_config = config["eval"] - cmd = [ - sys.executable, - "-m", - "pufferlib.pufferl", - "eval", - config["env"], - "--load-model-path", - latest_cpt, - "--eval.wosac-realism-eval", - "False", - "--eval.human-replay-eval", - "True", - "--eval.human-replay-num-agents", - str(eval_config["human_replay_num_agents"]), - "--eval.human-replay-control-mode", - str(eval_config["human_replay_control_mode"]), - ] - - # Run human replay evaluation in subprocess - result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd()) - - if result.returncode == 0: - # Extract JSON from stdout between markers - stdout = result.stdout - if "HUMAN_REPLAY_METRICS_START" in stdout and "HUMAN_REPLAY_METRICS_END" in stdout: - start = stdout.find("HUMAN_REPLAY_METRICS_START") + len("HUMAN_REPLAY_METRICS_START") - end = stdout.find("HUMAN_REPLAY_METRICS_END") - json_str = stdout[start:end].strip() - human_replay_metrics = json.loads(json_str) - - # Log to wandb if available - if hasattr(logger, "wandb") and logger.wandb: - logger.wandb.log( - { - "eval/human_replay_collision_rate": human_replay_metrics["collision_rate"], - "eval/human_replay_offroad_rate": human_replay_metrics["offroad_rate"], - "eval/human_replay_completion_rate": human_replay_metrics["completion_rate"], - }, - step=global_step, - ) - else: - print(f"Human replay evaluation failed with exit code {result.returncode}: {result.stderr}") - - except subprocess.TimeoutExpired: - print("Human replay evaluation timed out") - except Exception as e: - print(f"Failed to run human replay evaluation: {e}") - - -def run_wosac_eval_in_subprocess(config, logger, global_step): - """ - Run WOSAC evaluation in a subprocess and log metrics to wandb. - - Args: - config: Configuration dictionary containing data_dir, env, and wosac settings - logger: Logger object with run_id and optional wandb attribute - epoch: Current training epoch - global_step: Current global training step - - Returns: - None. Prints error messages if evaluation fails. 
- """ - try: - run_id = logger.run_id - model_dir = os.path.join(config["data_dir"], f"{config['env']}_{run_id}") - model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) - - if not model_files: - print("No model files found for WOSAC evaluation") - return - - latest_cpt = max(model_files, key=os.path.getctime) - - # Prepare evaluation command - eval_config = config.get("eval", {}) - cmd = [ - sys.executable, - "-m", - "pufferlib.pufferl", - "eval", - config["env"], - "--load-model-path", - latest_cpt, - "--eval.wosac-realism-eval", - "True", - "--eval.wosac-num-agents", - str(eval_config.get("wosac_num_agents", 256)), - "--eval.wosac-init-mode", - str(eval_config.get("wosac_init_mode", "create_all_valid")), - "--eval.wosac-control-mode", - str(eval_config.get("wosac_control_mode", "control_wosac")), - "--eval.wosac-init-steps", - str(eval_config.get("wosac_init_steps", 10)), - "--eval.wosac-goal-radius", - str(eval_config.get("wosac_goal_radius", 2.0)), - "--eval.wosac-sanity-check", - str(eval_config.get("wosac_sanity_check", False)), - "--eval.wosac-aggregate-results", - str(eval_config.get("wosac_aggregate_results", True)), - ] - - # Run WOSAC evaluation in subprocess - result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd()) - - if result.returncode == 0: - # Extract JSON from stdout between markers - stdout = result.stdout - if "WOSAC_METRICS_START" in stdout and "WOSAC_METRICS_END" in stdout: - start = stdout.find("WOSAC_METRICS_START") + len("WOSAC_METRICS_START") - end = stdout.find("WOSAC_METRICS_END") - json_str = stdout[start:end].strip() - wosac_metrics = json.loads(json_str) - - # Log to wandb if available - if hasattr(logger, "wandb") and logger.wandb: - logger.wandb.log( - { - "eval/wosac_realism_meta_score": wosac_metrics["realism_meta_score"], - "eval/wosac_ade": wosac_metrics["ade"], - "eval/wosac_min_ade": wosac_metrics["min_ade"], - "eval/wosac_total_num_agents": wosac_metrics["total_num_agents"], - }, - step=global_step, - ) - else: - print(f"WOSAC evaluation failed with exit code {result.returncode}") - print(f"Error: {result.stderr}") - - # Check for memory issues - stderr_lower = result.stderr.lower() - if "out of memory" in stderr_lower or "cuda out of memory" in stderr_lower: - print("GPU out of memory. Skipping this WOSAC evaluation.") - - except subprocess.TimeoutExpired: - print("WOSAC evaluation timed out after 600 seconds") - except MemoryError as e: - print(f"WOSAC evaluation ran out of memory. Skipping this evaluation: {e}") - except Exception as e: - print(f"Failed to run WOSAC evaluation: {type(e).__name__}: {e}") - - -def run_driving_behaviours_eval_in_subprocess(config, logger, global_step, behaviours_config): - """ - Run driving behaviours evaluation for each of the specified scenario classes in a subprocess. - - For each class defined in behaviours_config, calls `puffer eval puffer_drive` with: - - simulation_mode=replay, control_mode=control_sdc_only, init_mode=create_all_valid - - map_dir and num_agents from the class config - Parses HUMAN_REPLAY_METRICS_START/END JSON from stdout and logs to wandb under - driving_behaviours//. 
- """ - sampled_dirs = [] # temp symlink dirs created for num_scenarios sampling - try: - run_id = logger.run_id - model_dir = os.path.join(config["data_dir"], f"{config['env']}_{run_id}") - model_files = glob.glob(os.path.join(model_dir, "models", "model_*.pt")) - - if not model_files: - print("DrivingBehavioursEval: no model files found, skipping.") - return - - latest_cpt = max(model_files, key=os.path.getctime) - EVAL_SECTIONS_PREFIX = "eval_" - classes = [(name, cfg) for name, cfg in behaviours_config.items() if name.startswith(EVAL_SECTIONS_PREFIX)] - - all_results = {} - for class_name, class_cfg in classes: - map_dir = class_cfg.get("map_dir", "") - if isinstance(map_dir, str): - map_dir = map_dir.strip('"').strip("'") - if not os.path.isdir(map_dir): - print( - f"DrivingBehavioursEval [{class_name[len(EVAL_SECTIONS_PREFIX) :]}]: map_dir not found, skipping ({map_dir})" - ) - continue - all_bins = [f for f in os.listdir(map_dir) if f.endswith(".bin")] - if not all_bins: - print( - f"DrivingBehavioursEval [{class_name[len(EVAL_SECTIONS_PREFIX) :]}]: no .bin files in {map_dir}, skipping" - ) - continue - # Optional cap: random-sample N bins each eval pass via a fresh - # symlink dir. Different scenes per pass; better population estimate - # without paying for the full directory. - num_scenarios = class_cfg.get("num_scenarios") - if num_scenarios and int(num_scenarios) < len(all_bins): - k = int(num_scenarios) - sampled = random.sample(all_bins, k) - tmp_dir = tempfile.mkdtemp(prefix=f"db_eval_{class_name}_") - for fname in sampled: - os.symlink(os.path.join(map_dir, fname), os.path.join(tmp_dir, fname)) - map_dir = tmp_dir - sampled_dirs.append(tmp_dir) - num_agents = len([f for f in os.listdir(map_dir) if f.endswith(".bin")]) - scenario_length = class_cfg.get("scenario_length", 201) - short = class_name[len(EVAL_SECTIONS_PREFIX) :] - - cmd = [ - sys.executable, - "-m", - "pufferlib.pufferl", - "eval", - config["env"], - "--load-model-path", - latest_cpt, - "--eval.wosac-realism-eval", - "False", - "--eval.human-replay-eval", - "True", - "--eval.map-dir", - map_dir, - "--eval.human-replay-num-agents", - str(num_agents), - "--eval.human-replay-control-mode", - str(config["eval"].get("human_replay_control_mode", "control_sdc_only")), - "--env.simulation-mode", - "replay", - "--env.init-mode", - "create_all_valid", - "--eval.scenario-length", - str(scenario_length), - # Clean-eval overrides. Mirrors build_eval_overrides(clean=True): - # red lights enforced, no road-segment dropout, no partner - # blindness or phantom braking, wider partner budget. Subprocess - # re-parses the ini so training-time CLI overrides don't leak in - # here. (eval_mode is on ev/clean-eval branch, not this one.) 
- "--env.traffic-light-behavior", - "1", - "--env.lane-segment-dropout", - "0.0", - "--env.boundary-segment-dropout", - "0.0", - "--env.partner-blindness-prob", - "0.0", - "--env.phantom-braking-prob", - "0.0", - "--env.phantom-braking-trigger-prob", - "0.0", - "--env.max-partner-observations", - "32", - ] - - print(f"DrivingBehavioursEval: running class '{short}' with map_dir={map_dir}") - try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd()) - if result.returncode == 0: - stdout = result.stdout - if "HUMAN_REPLAY_METRICS_START" in stdout and "HUMAN_REPLAY_METRICS_END" in stdout: - start = stdout.find("HUMAN_REPLAY_METRICS_START") + len("HUMAN_REPLAY_METRICS_START") - end = stdout.find("HUMAN_REPLAY_METRICS_END") - metrics = json.loads(stdout[start:end].strip()) - all_results[class_name] = metrics - print(f"DrivingBehavioursEval [{short}]: {metrics}") - else: - print(f"DrivingBehavioursEval [{short}]: no metrics found in output") - else: - print( - f"DrivingBehavioursEval [{short}]: subprocess failed (exit {result.returncode}): {result.stderr[-500:]}" - ) - except subprocess.TimeoutExpired: - print(f"DrivingBehavioursEval [{short}]: timed out") - except Exception as e: - print(f"DrivingBehavioursEval [{short}]: error: {e}") - - # Log all class results to wandb - if hasattr(logger, "wandb") and logger.wandb and all_results: - payload = {} - for class_name, metrics in all_results.items(): - short = class_name[len(EVAL_SECTIONS_PREFIX) :] - for k, v in metrics.items(): - try: - payload[f"driving_behaviours/{short}/{k}"] = float(v) - except (TypeError, ValueError): - pass - if payload: - payload["train_step"] = global_step - logger.wandb.log(payload, step=global_step) - - except Exception as e: - print(f"DrivingBehavioursEval: unexpected error: {e}") - finally: - for d in sampled_dirs: - shutil.rmtree(d, ignore_errors=True) - - def render_videos(config, vecenv, logger, epoch, global_step, bin_path): """ Generate and log training videos using C-based rendering. From 88153155038c04fc3e28c8d3b9a464d582bf83fc Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 9 May 2026 18:32:19 -0400 Subject: [PATCH 04/26] [WIP] eval: explicit goal_advance_mode knob; replaces if-SIMULATION_REPLAY in c_step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add GOAL_ADVANCE_REGENERATE / GOAL_ADVANCE_SATURATE constants and a goal_advance_mode field on Drive. c_step's last-goal branch now dispatches on goal_advance_mode instead of simulation_mode. Drive.__init__ accepts goal_advance_mode kwarg with auto-pick based on simulation_mode (gigaflow → regenerate, replay → saturate) — same behavior as before, but the choice is explicit and per-eval-overridable. 
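
The resolution rule is small enough to sketch inline. Illustrative
only: it mirrors the drive.py hunk below, and a free function named
resolve_goal_advance_mode is not part of this patch.

    # 0 = GOAL_ADVANCE_REGENERATE, 1 = GOAL_ADVANCE_SATURATE
    # simulation_mode: 0 = SIMULATION_GIGAFLOW, 1 = SIMULATION_REPLAY
    def resolve_goal_advance_mode(goal_advance_mode, simulation_mode):
        if goal_advance_mode is None:
            # auto-pick: replay saturates, gigaflow regenerates
            return 1 if simulation_mode == 1 else 0
        if goal_advance_mode == "regenerate":
            return 0
        if goal_advance_mode == "saturate":
            return 1
        raise ValueError(f"goal_advance_mode must be 'regenerate' or 'saturate'. Got: {goal_advance_mode}")

A replay eval that does want fresh goals can now pass
goal_advance_mode="regenerate" instead of being pinned to the old
if-SIMULATION_REPLAY branch in c_step.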
Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/ocean/drive/binding.c | 1 + pufferlib/ocean/drive/drive.h | 20 ++++++++++++++++---- pufferlib/ocean/drive/drive.py | 17 +++++++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/pufferlib/ocean/drive/binding.c b/pufferlib/ocean/drive/binding.c index b2a4c20b1..0dd93a5e5 100644 --- a/pufferlib/ocean/drive/binding.c +++ b/pufferlib/ocean/drive/binding.c @@ -1788,6 +1788,7 @@ static int my_init(Env *env, PyObject *args, PyObject *kwargs) { env->init_mode = (int)unpack(kwargs, "init_mode"); env->control_mode = (int)unpack(kwargs, "control_mode"); env->simulation_mode = (int)unpack(kwargs, "simulation_mode"); + env->goal_advance_mode = (int)unpack(kwargs, "goal_advance_mode"); env->reward_conditioning = (bool)unpack(kwargs, "reward_conditioning"); env->reward_randomization = (bool)unpack(kwargs, "reward_randomization"); env->compute_eval_metrics = (bool)unpack(kwargs, "compute_eval_metrics"); diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index 17ee20987..701eb1b9f 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -64,6 +64,16 @@ #define SIMULATION_GIGAFLOW 0 #define SIMULATION_REPLAY 1 +// Goal advance modes — chosen when the SDC reaches the last goal in its +// sequence. REGENERATE recomputes a fresh set along the route (the +// gigaflow training pattern). SATURATE leaves the goal queue at its +// final state so the reached-goal condition won't fire again (the +// replay-mode pattern, where regenerating would dereference NULL paths +// for nuPlan bins without route info). Defaults to REGENERATE for +// gigaflow and SATURATE for replay; the Python config layer chooses. +#define GOAL_ADVANCE_REGENERATE 0 +#define GOAL_ADVANCE_SATURATE 1 + // Lane selection scoring #define LANE_SELECTION_DISTANCE_WEIGHT 0.7f #define LANE_SELECTION_HEADING_WEIGHT 0.3f @@ -336,6 +346,7 @@ struct Drive { int init_mode; int control_mode; int simulation_mode; + int goal_advance_mode; int termination_mode; float inactive_agent_threshold; int reward_conditioning; @@ -4866,10 +4877,11 @@ void c_step(Drive *env) { if (agent->current_goal_idx == env->num_target_waypoints) { // Last goal reached env->logs[i].num_goals_reached += 1; - if (env->simulation_mode == SIMULATION_REPLAY) { - // Replay mode: leave current_goal_idx saturated so the - // reached-goal condition won't fire again. Re-generating - // route-based goals on WOMD maps fails (removed=1). + if (env->goal_advance_mode == GOAL_ADVANCE_SATURATE) { + // Leave current_goal_idx saturated so the reached-goal + // condition won't fire again. Used by replay evals where + // regenerating route-based goals on WOMD/nuPlan bins + // would fail (path NULL or removed=1). } else { compute_goals(env, agent_idx); } diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 7dc537ca5..894448dca 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -57,6 +57,7 @@ def __init__( action_type="discrete", dynamics_model="classic", simulation_mode="gigaflow", + goal_advance_mode=None, termination_mode=0, inactive_agent_threshold=0.4, buf=None, @@ -229,6 +230,21 @@ def __init__( else: raise ValueError(f"simulation_mode must be one of 'gigaflow' or 'replay'. Got: {self.simulation_mode_str}") + # goal_advance_mode controls what happens when the SDC reaches the + # last goal in its sequence. None → auto-pick based on simulation_mode + # (gigaflow=regenerate, replay=saturate). 
Explicit values: "regenerate" + # or "saturate". + if goal_advance_mode is None: + self.goal_advance_mode = 1 if self.simulation_mode == 1 else 0 + elif goal_advance_mode == "regenerate": + self.goal_advance_mode = 0 + elif goal_advance_mode == "saturate": + self.goal_advance_mode = 1 + else: + raise ValueError( + f"goal_advance_mode must be one of 'regenerate' or 'saturate'. Got: {goal_advance_mode}" + ) + if self.control_mode_str == "control_vehicles": self.control_mode = 0 elif self.control_mode_str == "control_agents": @@ -387,6 +403,7 @@ def _env_init_kwargs(self, map_file, max_agents): "init_mode": self.init_mode, "control_mode": self.control_mode, "simulation_mode": self.simulation_mode, + "goal_advance_mode": self.goal_advance_mode, "reward_conditioning": self.reward_conditioning, "reward_randomization": self.reward_randomization, "compute_eval_metrics": self.compute_eval_metrics, From a6dd740a907cf582107478352624257f3db981d6 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 9 May 2026 18:32:56 -0400 Subject: [PATCH 05/26] README: cluster section now framed for NYU specifically Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c1e2e137e..2fc82a481 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ python setup.py build_ext --inplace --force ## Install (HPC cluster) -For clusters where the host glibc is too old or you need a CUDA toolchain that's not pinned by the OS, PufferDrive uses a **mixed Singularity + venv** layout: +For the NYU cluster, PufferDrive recommends a **mixed Singularity + venv** layout: - **Singularity image** (read-only, system-wide): supplies CUDA + cuDNN. - **ext3 overlay** (writable via `--fakeroot`, host the miniforge3 base interpreter at `/ext3/miniforge3` only). From 170183b32749fc466b505699c73fb9cbba158c9a Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 9 May 2026 18:38:09 -0400 Subject: [PATCH 06/26] ruff-format: collapse goal_advance_mode raise to one line Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/ocean/drive/drive.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 894448dca..1d9c4d081 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -241,9 +241,7 @@ def __init__( elif goal_advance_mode == "saturate": self.goal_advance_mode = 1 else: - raise ValueError( - f"goal_advance_mode must be one of 'regenerate' or 'saturate'. Got: {goal_advance_mode}" - ) + raise ValueError(f"goal_advance_mode must be one of 'regenerate' or 'saturate'. Got: {goal_advance_mode}") if self.control_mode_str == "control_vehicles": self.control_mode = 0 From eb21afd77b5e457492b0152d2d8615fecfa9bf11 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 9 May 2026 18:46:22 -0400 Subject: [PATCH 07/26] [WIP] eval: factor shared rollout loop into Evaluator base class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HumanReplay and MultiScenario rollouts had ~80% the same step loop (reset → forward_eval → step → collect infos → aggregate). Move it to the base class as `_run_rollout_loop`. 
Subclasses override only the hooks they actually need to diverge on:

  _initial_reset    (sync vs async reset)
  _maybe_reset_lstm (per-scenario reset cadence)
  _should_stop      (termination condition)
  _flatten_infos    (multi-worker vs single-worker info shape)
  _aggregate_infos  (per-key mean is the default)
  _render_pass      (no-op default; MultiScenario implements EGL)

HumanReplayEvaluator now overrides only `env_overrides` + `_should_stop`.
BehaviorClassEvaluator still inherits HumanReplay; only adds the tmp symlink
dir for random sampling. WOSAC keeps its custom rollout — its per-scene
multi-rollout loop doesn't fit the default shape.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../__pycache__/__init__.cpython-313.pyc        | Bin 0 -> 1183 bytes
 .../__pycache__/base.cpython-313.pyc            | Bin 0 -> 9080 bytes
 .../__pycache__/behavior_class.cpython-313.pyc  | Bin 0 -> 3386 bytes
 .../__pycache__/human_replay.cpython-313.pyc    | Bin 0 -> 2580 bytes
 .../__pycache__/multi_scenario.cpython-313.pyc  | Bin 0 -> 9406 bytes
 .../__pycache__/wosac.cpython-313.pyc           | Bin 0 -> 2709 bytes
 pufferlib/ocean/benchmark/evaluators/base.py    | 115 ++++++++++++---
 .../benchmark/evaluators/behavior_class.py      |  15 +-
 .../benchmark/evaluators/human_replay.py        |  58 ++------
 .../benchmark/evaluators/multi_scenario.py      | 136 ++++++------------
 10 files changed, 156 insertions(+), 168 deletions(-)
 create mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/__init__.cpython-313.pyc
 create mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/base.cpython-313.pyc
 create mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/behavior_class.cpython-313.pyc
 create mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/human_replay.cpython-313.pyc
 create mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/multi_scenario.cpython-313.pyc
 create mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/wosac.cpython-313.pyc

[GIT binary patch blobs for the six committed __pycache__/*.pyc files omitted]

diff --git a/pufferlib/ocean/benchmark/evaluators/base.py b/pufferlib/ocean/benchmark/evaluators/base.py
index 066c1919d..77a7ad95e 100644
--- a/pufferlib/ocean/benchmark/evaluators/base.py
+++ b/pufferlib/ocean/benchmark/evaluators/base.py
@@ -1,7 +1,6 @@
-"""Evaluator base class + EvalResult dataclass."""
+"""Evaluator base class + default rollout loop + EvalResult dataclass."""
 
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import ClassVar
 
 
@@ -14,9 +13,11 @@ class EvalResult:
 class Evaluator:
     """Base class for all evaluators.
 
-    Subclasses set `type_name` (the value used in `[eval.].type`) and
-    implement `rollout()`. Optionally override `env_overrides()`,
-    `vec_overrides()`, and `aggregate()`.
+    Subclasses typically override only `_should_stop` (the loop termination
+    condition) and `env_overrides`. The default `rollout` runs a step loop
+    suitable for "stream of episode infos until target count reached" evals.
+
+    To diverge from the default loop entirely, override `rollout` directly.
""" type_name: ClassVar[str] = "" @@ -39,6 +40,8 @@ def __init__(self, name: str, config: dict, train_config: dict): self.render_views: list = list(config.get("render_views", ["sim_state"])) self.clean: bool = bool(config.get("clean", True)) + # -- Config hooks --------------------------------------------------- + def env_overrides(self) -> dict: """Per-evaluator [env] overrides. Defaults to whatever the section wrote under `env.*`. Subclasses can override to add baseline knobs.""" @@ -53,24 +56,104 @@ def vec_overrides(self) -> dict: base.update(self.config.get("vec", {})) return base + # -- Rollout (default) ---------------------------------------------- + def rollout(self, vecenv, policy, args) -> EvalResult: - raise NotImplementedError + """Default rollout: reset → step → collect infos → aggregate. - def aggregate(self, per_rollout: list) -> dict: - """Reduce a list of per-rollout dicts to a single metrics dict. + Subclasses tune behavior via the hooks below. Override this + method directly only if the loop shape itself needs to differ + (e.g. per-scene multi-rollout patterns). + """ + metrics = self._run_rollout_loop(vecenv, policy, args) + frames = self._render_pass(vecenv, policy, args) if self.render else [] + return EvalResult(metrics=metrics, frames=frames) - Default: numeric mean over keys present in any sub-dict. WOSAC - overrides for likelihood-style aggregation.""" + def _run_rollout_loop(self, vecenv, policy, args) -> dict: import numpy as np + import torch + + import pufferlib + + device = args["train"]["device"] + num_agents = vecenv.observation_space.shape[0] + state = self._init_lstm_state(num_agents, policy, device, args) + + obs = self._initial_reset(vecenv, args) + + infos_collected: list = [] + steps = 0 + while not self._should_stop(args, infos_collected, steps): + self._maybe_reset_lstm(state, steps, args) + + with torch.no_grad(): + ob_t = torch.as_tensor(obs).to(device) + logits, _ = policy.forward_eval(ob_t, state) + action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) + action = action.cpu().numpy().reshape(vecenv.action_space.shape) + if isinstance(logits, torch.distributions.Normal): + action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) + + obs, _, _, _, infos = vecenv.step(action) + infos_collected.extend(self._flatten_infos(infos)) + steps += 1 + + return self._aggregate_infos(infos_collected) + + # -- Loop hooks (subclass-overridable) ------------------------------ - if not per_rollout: + def _initial_reset(self, vecenv, args): + """Return the initial observation. Default: synchronous reset.""" + obs, _ = vecenv.reset() + return obs + + def _init_lstm_state(self, num_agents, policy, device, args) -> dict: + if not args["train"].get("use_rnn"): return {} - keys = set() - for r in per_rollout: - keys.update(r.keys()) - out = {} + import torch + + return dict( + lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), + lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), + ) + + def _maybe_reset_lstm(self, state, steps, args): + """Hook for resetting LSTM state mid-rollout. Default: no-op.""" + pass + + def _should_stop(self, args, infos_collected, steps) -> bool: + """Loop termination. Subclasses must override.""" + raise NotImplementedError + + def _flatten_infos(self, infos) -> list: + """Pufferlib backends return either a list-of-list (multi-worker) or + a single list (PufferEnv backend). 
Flatten to a list of dicts.""" + out = [] + if not infos: + return out + for sub in infos: + if not sub: + continue + if isinstance(sub, list): + out.extend(sub) + else: + out.append(sub) + return out + + def _aggregate_infos(self, infos: list) -> dict: + """Default: numeric mean per key, plus a num_scenarios_completed count.""" + if not infos: + return {"num_scenarios_completed": 0} + import numpy as np + + out = {"num_scenarios_completed": float(len(infos))} + keys = set().union(*(d.keys() for d in infos)) for k in keys: - vals = [r[k] for r in per_rollout if k in r and isinstance(r[k], (int, float))] + vals = [d[k] for d in infos if isinstance(d.get(k), (int, float))] if vals: out[k] = float(np.mean(vals)) return out + + def _render_pass(self, vecenv, policy, args) -> list: + """Render hook. Subclasses that support frame capture override this.""" + return [] diff --git a/pufferlib/ocean/benchmark/evaluators/behavior_class.py b/pufferlib/ocean/benchmark/evaluators/behavior_class.py index 2d3a7f45c..ef8425fa7 100644 --- a/pufferlib/ocean/benchmark/evaluators/behavior_class.py +++ b/pufferlib/ocean/benchmark/evaluators/behavior_class.py @@ -1,8 +1,7 @@ """BehaviorClassEvaluator — one nuPlan behavior category at a time. -Runs a HumanReplayEvaluator-style rollout against a single map_dir, with -optional fresh random sampling each pass when `num_scenarios` < total bins. -""" +Inherits HumanReplayEvaluator's loop. Adds optional fresh random sampling +each pass when `num_scenarios` < total bins (via a tmp symlink dir).""" import os import random @@ -10,7 +9,6 @@ import tempfile from typing import ClassVar -from pufferlib.ocean.benchmark.evaluators.base import EvalResult from pufferlib.ocean.benchmark.evaluators.human_replay import HumanReplayEvaluator @@ -22,9 +20,6 @@ def __init__(self, name, config, train_config): self._sampled_dir = None # tmp symlink dir created per pass def env_overrides(self) -> dict: - # Reuse HumanReplay's defaults, then handle the random-sampling - # cap. If num_scenarios is smaller than total bins, build a tmp - # symlink dir with a fresh sample each pass and point map_dir there. env = super().env_overrides() map_dir = env.get("map_dir", "") if not map_dir or not os.path.isdir(map_dir): @@ -45,12 +40,6 @@ def env_overrides(self) -> dict: env["num_maps"] = len(all_bins) return env - def rollout(self, vecenv, policy, args) -> EvalResult: - result = super().rollout(vecenv, policy, args) - # Manager owns the cleanup window — defer rmtree until after vecenv.close - # so any open file descriptors on the symlinks are released first. - return result - def cleanup(self): if self._sampled_dir and os.path.isdir(self._sampled_dir): shutil.rmtree(self._sampled_dir, ignore_errors=True) diff --git a/pufferlib/ocean/benchmark/evaluators/human_replay.py b/pufferlib/ocean/benchmark/evaluators/human_replay.py index 28e19e22b..10f06b277 100644 --- a/pufferlib/ocean/benchmark/evaluators/human_replay.py +++ b/pufferlib/ocean/benchmark/evaluators/human_replay.py @@ -1,14 +1,13 @@ """HumanReplayEvaluator — replay mode + control_sdc_only, one rollout per -bin in the map_dir, mean of per-episode info dicts.""" +bin in the map_dir, mean of per-episode info dicts. 
+ +Inherits the default rollout loop from `Evaluator`; only overrides +`_should_stop` to terminate once every bin has produced one info.""" import os from typing import ClassVar -import numpy as np -import torch - -import pufferlib -from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator +from pufferlib.ocean.benchmark.evaluators.base import Evaluator class HumanReplayEvaluator(Evaluator): @@ -32,48 +31,11 @@ def env_overrides(self) -> dict: env["num_maps"] = env["num_agents"] return env - def rollout(self, vecenv, policy, args) -> EvalResult: - device = args["train"]["device"] + def _should_stop(self, args, infos_collected, steps) -> bool: + # Stop once every bin has yielded one info, OR after a step budget + # generous enough to give every bin a chance (env auto-resamples). scenario_length = int(args["env"]["scenario_length"]) init_steps = int(args["env"].get("init_steps", 0)) num_maps = int(args["env"]["num_maps"]) - num_agents = vecenv.observation_space.shape[0] - - # +1 step margin: env emits done on the step after scenario_length. - total_steps = (scenario_length - init_steps + 1) * num_maps - - obs, _ = vecenv.reset() - state = {} - if args["train"]["use_rnn"]: - state = dict( - lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), - lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), - ) - - all_infos = [] - for _ in range(total_steps): - with torch.no_grad(): - ob_t = torch.as_tensor(obs).to(device) - logits, _ = policy.forward_eval(ob_t, state) - action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) - action_np = action.cpu().numpy().reshape(vecenv.action_space.shape) - if isinstance(logits, torch.distributions.Normal): - action_np = np.clip(action_np, vecenv.action_space.low, vecenv.action_space.high) - obs, _, _, _, info_list = vecenv.step(action_np) - if info_list: - all_infos.extend(info_list) - # Stop once every bin has yielded one info to avoid double-counting - # on the second cycle through the dir. - if len(all_infos) >= num_maps: - break - - if not all_infos: - return EvalResult(metrics={"num_scenarios_completed": 0}) - - metrics = {"num_scenarios_completed": float(len(all_infos))} - keys = set().union(*(d.keys() for d in all_infos)) - for k in keys: - vals = [d[k] for d in all_infos if isinstance(d.get(k), (int, float))] - if vals: - metrics[k] = float(np.mean(vals)) - return EvalResult(metrics=metrics, frames=[]) + max_steps = (scenario_length - init_steps + 1) * num_maps + return len(infos_collected) >= num_maps or steps >= max_steps diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py index 365505bfd..0703e6567 100644 --- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py +++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py @@ -1,21 +1,21 @@ """MultiScenarioEvaluator — distribute scenarios across workers, one rollout -per scenario, mean per-scenario metrics.""" +per scenario, mean per-scenario metrics. Drives both the gigaflow validation +path and replay-style multi-scenario evals. 
+ +Inherits the default loop from `Evaluator`; overrides `_should_stop` (cap by +scenario count), `_initial_reset` (async reset for multi-worker throughput), +`_maybe_reset_lstm` (per-scenario LSTM reset), and `_render_pass` (the C-side +EGL → ffmpeg mp4 dump).""" -import contextlib import os -import time from pathlib import Path +from typing import ClassVar -import numpy as np -import torch -import tqdm - -import pufferlib -from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator +from pufferlib.ocean.benchmark.evaluators.base import Evaluator class MultiScenarioEvaluator(Evaluator): - type_name = "multi_scenario" + type_name: ClassVar[str] = "multi_scenario" def vec_overrides(self) -> dict: # Multi-worker by default for throughput. Override via [eval..vec]. @@ -24,8 +24,6 @@ def vec_overrides(self) -> dict: return {"backend": backend, "num_envs": num_envs} def env_overrides(self) -> dict: - # Sensible defaults for the gigaflow path; replay configs are expected - # to set the relevant knobs in [eval..env.*]. env = { "eval_mode": 1, "termination_mode": 0, @@ -34,94 +32,50 @@ def env_overrides(self) -> dict: env.update(self.config.get("env", {})) return env - def rollout(self, vecenv, policy, args) -> EvalResult: - t0 = time.time() - num_scenarios = int(self.config.get("eval", {}).get("num_scenarios", 1)) - scenario_length = int(args["env"].get("scenario_length", 91)) - device = args["train"]["device"] - num_agents = vecenv.observation_space.shape[0] - - global_infos = {} - - # LSTM hidden state shared across the rollout; reset each scenario batch. - state = {} - if args["train"]["use_rnn"]: - state = dict( - lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), - lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), - ) + # -- Loop hooks -- + def _initial_reset(self, vecenv, args): + # Multi-worker async reset gives us the parallel-throughput path. 
vecenv.async_reset(args.get("seed", 42)) - ob, _, _, _, infos, _, _ = vecenv.recv() - scenarios_processed = 0 - with tqdm.tqdm(total=num_scenarios, desc=f"[{self.name}] scenarios", disable=args.get("quiet", False)) as pbar: - while scenarios_processed < num_scenarios: - if args["train"]["use_rnn"]: - state["lstm_h"].zero_() - state["lstm_c"].zero_() - - for _ in range(scenario_length): - with torch.no_grad(): - ob_t = torch.as_tensor(ob).to(device) - logits, _ = policy.forward_eval(ob_t, state) - action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) - action = action.cpu().numpy().reshape(vecenv.action_space.shape) - if isinstance(logits, torch.distributions.Normal): - action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) - - ob, _, _, _, infos = vecenv.step(action) - - if infos and infos[0]: - for sub_env in infos: - for env_idx, summary in enumerate(sub_env): - map_name = summary["map_name"].split("/")[-1].split(".")[0] - summary["episode_id"] = env_idx - summary["map_name"] = map_name - scenarios_processed += 1 - pbar.update(1) - for k, v in summary.items(): - global_infos.setdefault(k, []).append(v) - - metrics = self._average(global_infos) - if not args.get("quiet", False): - print(f"[{self.name}] {scenarios_processed} scenarios in {time.time() - t0:.1f}s") - - frames = [] - if self.render: - frames = self._render_pass(vecenv, policy, args) - - return EvalResult(metrics=metrics, frames=frames) - - def _average(self, global_infos: dict) -> dict: - out = {} - import numbers - - for k, vs in global_infos.items(): - if k == "num_scenarios": - out[k] = float(np.sum(vs)) - elif vs and isinstance(vs[0], numbers.Number): - out[k] = float(np.mean(vs)) - return out + ob, _, _, _, _, _, _ = vecenv.recv() + return ob + + def _maybe_reset_lstm(self, state, steps, args): + # Reset between scenarios — gigaflow's auto-resample fires at the + # end of scenario_length, so steps % scenario_length == 0 is the + # natural boundary. No-op when LSTM is unused. + if not state or steps == 0: + return + scenario_length = int(args["env"].get("scenario_length", 0)) + if scenario_length > 0 and steps % scenario_length == 0: + state["lstm_h"].zero_() + state["lstm_c"].zero_() + + def _should_stop(self, args, infos_collected, steps) -> bool: + target = int(self.config.get("eval", {}).get("num_scenarios", 1)) + return len(infos_collected) >= target + + # -- Render -- def _render_pass(self, vecenv, policy, args) -> list: """One rollout per view, all writing mp4s to a single dir. - Re-uses the same vecenv if it's a single-worker setup; otherwise - delegates to a serial render env built fresh per view. + Builds a fresh single-worker env so frame capture is sequential + and starting_map_counter starts at 0 — the C-side ffmpeg-per-env + wiring assumes one bin at a time per process. """ import importlib - env_name = args["env_name"] + import pufferlib + backend = args.get("render_backend", "egl") if backend != "egl": return [] + env_name = args["env_name"] out_dir = Path(args.get("render_results_dir") or args.get("eval_results_dir") or ".") / "mp4" out_dir.mkdir(parents=True, exist_ok=True) - # Render with a fresh single-worker env so frame capture is sequential - # and starting_map_counter starts at 0. Multi-worker render doesn't - # match the C-side ffmpeg-per-env wiring cleanly. 
package = args.get("package", "ocean") module_name = "pufferlib.ocean" if package == "ocean" else f"pufferlib.environments.{package}" env_module = importlib.import_module(module_name) @@ -155,6 +109,11 @@ def _render_pass(self, vecenv, policy, args) -> list: return all_paths def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: Path) -> list: + import numpy as np + import torch + + import pufferlib + device = args["train"]["device"] num_agents = vecenv.observation_space.shape[0] num_scenarios = int(self.config.get("eval", {}).get("num_scenarios", 1)) @@ -163,12 +122,7 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: saved_cwd = os.getcwd() os.chdir(out_dir) try: - state = {} - if args["train"]["use_rnn"]: - state = dict( - lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), - lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), - ) + state = self._init_lstm_state(num_agents, policy, device, args) scenarios_processed = 0 while scenarios_processed < num_scenarios: ob, _ = vecenv.reset() @@ -176,7 +130,7 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: num_in_batch = len(scenarios) remaining = num_scenarios - scenarios_processed - num_in_batch target_env.batch_size_eval = max(1, remaining) - if args["train"]["use_rnn"]: + if state: state["lstm_h"].zero_() state["lstm_c"].zero_() for _ in range(max_steps): From aeb9dd565a2ff7ca1d5e38fa1602fc68537d0f48 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 9 May 2026 18:47:31 -0400 Subject: [PATCH 08/26] [WIP] eval: untrack accidentally-committed pycache; harden gitignore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The benchmark/** unignore was over-broad — it pulled in pycache files that should never be committed. Add an explicit re-ignore for __pycache__ under that path. 
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .gitignore                                        |   2 ++
 .../__pycache__/__init__.cpython-313.pyc          | Bin 1183 -> 0 bytes
 .../evaluators/__pycache__/base.cpython-313.pyc   | Bin 9080 -> 0 bytes
 .../__pycache__/behavior_class.cpython-313.pyc    | Bin 3386 -> 0 bytes
 .../__pycache__/human_replay.cpython-313.pyc      | Bin 2580 -> 0 bytes
 .../__pycache__/multi_scenario.cpython-313.pyc    | Bin 9406 -> 0 bytes
 .../evaluators/__pycache__/wosac.cpython-313.pyc  | Bin 2709 -> 0 bytes
 7 files changed, 2 insertions(+)
 delete mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/__init__.cpython-313.pyc
 delete mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/base.cpython-313.pyc
 delete mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/behavior_class.cpython-313.pyc
 delete mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/human_replay.cpython-313.pyc
 delete mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/multi_scenario.cpython-313.pyc
 delete mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/wosac.cpython-313.pyc

diff --git a/.gitignore b/.gitignore
index 782cfdf36..430b374ec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -150,6 +150,8 @@ experiments/
 benchmark*/
 !pufferlib/ocean/benchmark/
 !pufferlib/ocean/benchmark/**
+# But re-ignore caches inside it
+pufferlib/ocean/benchmark/**/__pycache__/
 wandb/
 .neptune/
 raylib*/

[GIT binary patch blobs deleting the six __pycache__/*.pyc files omitted]

From 3f28b08e02ecbaa0154af9cb86befc86cf89bead Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sat, 9 May 2026 18:51:45 -0400
Subject: [PATCH 09/26] tests: add 3-level inheritance + self-cycle cases for
 EvalManager parser

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 tests/test_eval_manager.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/test_eval_manager.py b/tests/test_eval_manager.py
index 064463cc7..0b2356a41 100644
--- a/tests/test_eval_manager.py
+++ b/tests/test_eval_manager.py
@@ -47,6 +47,26 @@ def test_inheritance_chain():
     assert cfg["env"]["map_dir"] == "/tmp/hard_stop"
 
 
+def test_inheritance_three_levels():
+    # C inherits B inherits A. Each level overrides the one above.
+ sections = { + "A": {"interval": 100, "env.scenario_length": 91, "env.map_dir": "/A"}, + "B": {"inherits": "A", "interval": 200, "env.scenario_length": 201}, + "C": {"inherits": "B", "env.map_dir": "/C", "render": True}, + } + cfg = _build_section_config("C", sections["C"], sections) + assert cfg["interval"] == 200, "B should win over A on interval" + assert cfg["env"]["scenario_length"] == 201, "B should win over A on scenario_length" + assert cfg["env"]["map_dir"] == "/C", "C should win over A and B on map_dir" + assert cfg["render"] is True, "C's own field" + + +def test_inheritance_self_cycle_detected(): + sections = {"a": {"inherits": "a"}} + with pytest.raises(ValueError, match="Cyclic"): + _build_section_config("a", sections["a"], sections) + + def test_inheritance_child_wins(): sections = { "parent": {"interval": 250, "env.scenario_length": 201}, From 8b3519458d6053b4663c707e91b385539a6c0f99 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 9 May 2026 23:37:33 -0400 Subject: [PATCH 10/26] [WIP] eval: render budget knob + random scenario selection per epoch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - eval.render_num_scenarios: explicit per-evaluator render budget; defaults to min(eval.num_scenarios, 3). Renders are expensive (mp4 encode + wandb upload) and shouldn't run at metric scale. - Render path randomizes starting_map per epoch when not pinned, so successive renders show different bins from the dir instead of the first N alphabetically. Restores the old behavior from _render_driving_behaviours that the refactor lost. - behaviors_defaults pins render_num_scenarios = 2 so 12 classes × 2 views × 2 scenarios = 48 mp4s/epoch (vs 1200 if it inherited num_scenarios=50). - Test: render_num_scenarios is inheritable. "Worst N" selection deferred — needs per-scenario scores piped out of the metric rollout into the render call. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/config/ocean/drive.ini | 4 +++ .../benchmark/evaluators/multi_scenario.py | 30 ++++++++++++++++--- tests/test_eval_manager.py | 20 +++++++++++++ 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index d6d62a025..ebe977e02 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -258,6 +258,10 @@ env.init_mode = "create_all_valid" env.scenario_length = 201 env.max_partner_observations = 32 eval.num_scenarios = 50 +; Render budget per epoch (metrics still use the full num_scenarios). +; Defaults to min(num_scenarios, 3); pin lower if even 3 mp4s × 12 classes +; × 2 views is too much wandb traffic. +eval.render_num_scenarios = 2 [eval.behaviors_full_dir] inherits = "behaviors_defaults" diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py index 0703e6567..bd6c5672a 100644 --- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py +++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py @@ -60,11 +60,19 @@ def _should_stop(self, args, infos_collected, steps) -> bool: def _render_pass(self, vecenv, policy, args) -> list: """One rollout per view, all writing mp4s to a single dir. - Builds a fresh single-worker env so frame capture is sequential - and starting_map_counter starts at 0 — the C-side ffmpeg-per-env - wiring assumes one bin at a time per process. + Builds a fresh single-worker env per view (C-side ffmpeg-per-env + wiring assumes one bin at a time per process). 
Render budget and + starting position are independent of the metric pass: + + eval.render_num_scenarios — how many scenarios to render. Defaults + to min(eval.num_scenarios, 3). Always respected over + num_scenarios so renders stay cheap. + starting_map — randomized per render epoch so successive epochs + show different scenarios from the dir, not the same first-N + alphabetically. Set explicitly in env.* to pin. """ import importlib + import random import pufferlib @@ -84,6 +92,15 @@ def _render_pass(self, vecenv, policy, args) -> list: render_env_kwargs = dict(args["env"]) render_env_kwargs["render_mode"] = "headless" + # Random starting map per render epoch — every epoch shows a + # different bin from the directory rather than the first N + # alphabetically. The user can pin by setting env.starting_map + # explicitly in the [eval.] section. + if "starting_map" not in self.config.get("env", {}): + num_maps = int(render_env_kwargs.get("num_maps", 1)) + if num_maps > 1: + render_env_kwargs["starting_map"] = random.randint(0, num_maps - 1) + all_paths = [] for view in self.render_views: view_idx = _VIEW_NAME_TO_IDX.get(view, 0) @@ -116,7 +133,12 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: device = args["train"]["device"] num_agents = vecenv.observation_space.shape[0] - num_scenarios = int(self.config.get("eval", {}).get("num_scenarios", 1)) + # Render budget defaults to min(num_scenarios, 3) if not set explicitly. + # Renders are expensive (mp4 encode + wandb upload) so we don't want + # them at metric-pass scale. + eval_cfg = self.config.get("eval", {}) + metric_count = int(eval_cfg.get("num_scenarios", 1)) + num_scenarios = int(eval_cfg.get("render_num_scenarios", min(metric_count, 3))) max_steps = args.get("render_max_steps") or int(args["env"].get("scenario_length", 91)) saved_cwd = os.getcwd() diff --git a/tests/test_eval_manager.py b/tests/test_eval_manager.py index 0b2356a41..b4ab620b7 100644 --- a/tests/test_eval_manager.py +++ b/tests/test_eval_manager.py @@ -136,6 +136,26 @@ def test_manager_from_config_skips_template_sections(): assert "behaviors_defaults" not in names # template, no `type` field +def test_render_num_scenarios_inheritable(): + # Behavior-style template specifies a small render budget; the per-class + # section inherits it without re-declaring. + sections = { + "defaults": { + "type": "behavior_class", + "interval": 250, + "eval.num_scenarios": 50, + "eval.render_num_scenarios": 2, + }, + "hard_stop": { + "inherits": "defaults", + "env.map_dir": "/tmp/hard_stop", + }, + } + cfg = _build_section_config("hard_stop", sections["hard_stop"], sections) + assert cfg["eval"]["num_scenarios"] == 50 + assert cfg["eval"]["render_num_scenarios"] == 2 + + def test_manager_unknown_type_raises(): train_config = {"eval": {"foo": {"type": "totally_made_up"}}} with pytest.raises(ValueError, match="not registered"): From 1537f566330a703133a00be92f9cca1da8768898 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 9 May 2026 23:47:01 -0400 Subject: [PATCH 11/26] [WIP] eval: subprocess evals see fresh checkpoint, not stale resume path Two coupled fixes: EvalManager: - Accepts run_id at construction. Used to locate the per-run models/ directory. - latest_checkpoint(env_name) walks data_dir/_/models/ for the newest model_*.pt. Falls back to train_config.load_model_path if no checkpoints exist yet. - has_subprocess_evals_at(epoch) reports whether any enabled subprocess evaluator would fire at that epoch. 
- _run_subprocess uses latest_checkpoint instead of train_config.load_model_path. PuffeRL.evaluate: - Calls save_checkpoint() before maybe_run() if any subprocess evaluator would fire. Mirrors the old run_driving_behaviours _eval_in_subprocess flow. - train() passes logger.run_id when constructing the manager. For all-inline configs (today's drive.ini default) this is a no-op. Activates when an evaluator is flipped to mode=subprocess. Tests: latest_checkpoint picks the newest by ctime, falls back to load_model_path when no models exist; has_subprocess_evals_at fires only on enabled subprocess evaluators at matching intervals. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/ocean/benchmark/manager.py | 43 ++++++++++++++++++++++---- pufferlib/pufferl.py | 7 ++++- tests/test_eval_manager.py | 46 ++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 7 deletions(-) diff --git a/pufferlib/ocean/benchmark/manager.py b/pufferlib/ocean/benchmark/manager.py index 86dedaacc..9e79a50bc 100644 --- a/pufferlib/ocean/benchmark/manager.py +++ b/pufferlib/ocean/benchmark/manager.py @@ -20,6 +20,7 @@ """ import copy +import glob import importlib import json import os @@ -45,12 +46,15 @@ class EvalManager: - def __init__(self, evaluators: list, train_config: dict): + def __init__(self, evaluators: list, train_config: dict, run_id: str = None): self.evaluators = evaluators self.train_config = train_config + # `run_id` is needed to resolve the latest checkpoint for subprocess + # evals. None is fine if no evaluator is mode=subprocess. + self.run_id = run_id @classmethod - def from_config(cls, train_config: dict) -> "EvalManager": + def from_config(cls, train_config: dict, run_id: str = None) -> "EvalManager": sections = _discover_eval_sections(train_config) evaluators = [] for name, raw in sections.items(): @@ -66,7 +70,31 @@ def from_config(cls, train_config: dict) -> "EvalManager": f"Known types: {sorted(EVALUATOR_REGISTRY.keys())}" ) evaluators.append(cls_for_type(name=name, config=cfg, train_config=train_config)) - return cls(evaluators=evaluators, train_config=train_config) + return cls(evaluators=evaluators, train_config=train_config, run_id=run_id) + + def has_subprocess_evals_at(self, epoch: int) -> bool: + """True if any enabled subprocess evaluator would fire at this epoch. + Training loop uses this to decide whether to save_checkpoint() before + calling maybe_run() — subprocesses load the checkpoint from disk.""" + for ev in self.evaluators: + if not ev.enabled or ev.mode != "subprocess" or ev.interval <= 0: + continue + if epoch % ev.interval == 0: + return True + return False + + def latest_checkpoint(self, env_name: str) -> str: + """Return the path to the most recent model_*.pt under the experiment + dir. Falls back to train_config['load_model_path'] if no checkpoints + have been written yet (e.g. resume-from path before first save). + Returns None if neither resolves.""" + if self.run_id and self.train_config.get("data_dir"): + model_dir = os.path.join(self.train_config["data_dir"], f"{env_name}_{self.run_id}", "models") + if os.path.isdir(model_dir): + files = glob.glob(os.path.join(model_dir, "model_*.pt")) + if files: + return max(files, key=os.path.getctime) + return self.train_config.get("load_model_path") def maybe_run(self, epoch: int, policy, env_name: str, logger=None, global_step=None) -> dict: """Called from the training loop. 
Runs every enabled evaluator
@@ -147,9 +175,12 @@ def _run_subprocess(self, ev: Evaluator, env_name: str, global_step) -> EvalResu
             "--out", str(out_path),
         ]
-        # Subprocess inherits the same checkpoint via train_config.load_model_path.
-        if self.train_config.get("load_model_path"):
-            cmd += ["--load-model-path", self.train_config["load_model_path"]]
+        # Subprocess loads the freshest checkpoint on disk. Caller (training
+        # loop) is responsible for save_checkpoint() before this fires —
+        # see has_subprocess_evals_at.
+        ckpt = self.latest_checkpoint(env_name)
+        if ckpt:
+            cmd += ["--load-model-path", ckpt]
         subprocess.run(cmd, check=True)
         with open(out_path) as f:
             payload = json.load(f)
diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py
index 7176c9759..e6b275d22 100644
--- a/pufferlib/pufferl.py
+++ b/pufferlib/pufferl.py
@@ -462,6 +462,11 @@ def train(self):
         # the manager fires any whose interval divides this epoch. See
         # docs/eval_unification.md for the design.
         if self._eval_manager is not None:
+            # Subprocess evals load the policy from disk. Save the latest
+            # checkpoint first so they see this epoch's weights, not the
+            # last save_checkpoint() from `checkpoint_interval`.
+            if self._eval_manager.has_subprocess_evals_at(self.epoch):
+                self.save_checkpoint()
             self._eval_manager.maybe_run(
                 epoch=self.epoch,
                 policy=self.uncompiled_policy,
@@ -1376,7 +1381,7 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None, early_stop

     from pufferlib.ocean.benchmark.manager import EvalManager

-    pufferl._eval_manager = EvalManager.from_config(args)
+    pufferl._eval_manager = EvalManager.from_config(args, run_id=logger.run_id if logger else None)

     # Restore optimizer state + step counters when resuming from a checkpoint.
     # save_checkpoint writes models/model_<env_name>_<epoch>.pt and trainer_state.pt
diff --git a/tests/test_eval_manager.py b/tests/test_eval_manager.py
index b4ab620b7..f009901df 100644
--- a/tests/test_eval_manager.py
+++ b/tests/test_eval_manager.py
@@ -160,3 +160,49 @@ def test_manager_unknown_type_raises():
     train_config = {"eval": {"foo": {"type": "totally_made_up"}}}
     with pytest.raises(ValueError, match="not registered"):
         EvalManager.from_config(train_config)
+
+
+def test_has_subprocess_evals_at():
+    train_config = {
+        "eval": {
+            "inline_one": {"type": "human_replay", "interval": 25, "mode": "inline"},
+            "subprocess_one": {"type": "human_replay", "interval": 100, "mode": "subprocess"},
+            "subprocess_disabled": {
+                "type": "human_replay",
+                "interval": 100,
+                "mode": "subprocess",
+                "enabled": False,
+            },
+        }
+    }
+    mgr = EvalManager.from_config(train_config)
+    assert mgr.has_subprocess_evals_at(epoch=100) is True  # subprocess_one fires
+    assert mgr.has_subprocess_evals_at(epoch=25) is False  # only inline at 25
+    assert mgr.has_subprocess_evals_at(epoch=50) is False  # nothing at 50
+
+
+def test_latest_checkpoint_finds_newest_pt(tmp_path):
+    import time
+
+    model_dir = tmp_path / "puffer_drive_run123" / "models"
+    model_dir.mkdir(parents=True)
+    p_old = model_dir / "model_puffer_drive_001.pt"
+    p_old.write_text("a")
+    time.sleep(0.05)
+    p_new = model_dir / "model_puffer_drive_002.pt"
+    p_new.write_text("b")
+
+    train_config = {"data_dir": str(tmp_path), "eval": {}}
+    mgr = EvalManager.from_config(train_config, run_id="run123")
+    assert mgr.latest_checkpoint("puffer_drive") == str(p_new)
+
+
+def test_latest_checkpoint_falls_back_to_load_model_path(tmp_path):
+    train_config = {
+        "data_dir": str(tmp_path),
+        "load_model_path": "/some/resume/path.pt",
+        "eval": {},
+    }
+    mgr = 
EvalManager.from_config(train_config, run_id="run123") + # No models dir exists → falls back to load_model_path + assert mgr.latest_checkpoint("puffer_drive") == "/some/resume/path.pt" From 7af739a04960c2a87f422b2333077aeaac136766 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 00:06:26 -0400 Subject: [PATCH 12/26] =?UTF-8?q?[WIP]=20eval:=20revert=20goal=5Fadvance?= =?UTF-8?q?=5Fmode=20C=20knob=20=E2=80=94=20defer=20to=20a=20separate=20PR?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The unified-eval refactor doesn't actually need this knob, and pushing it through the C struct + binding + drive.py at the same time bloats this PR. Restoring the original `if (env->simulation_mode == SIMULATION_REPLAY)` branch in c_step. The "promote implicit branches to explicit knobs" audit is real work but lives in its own PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/ocean/drive/binding.c | 1 - pufferlib/ocean/drive/drive.h | 20 ++++---------------- pufferlib/ocean/drive/drive.py | 15 --------------- 3 files changed, 4 insertions(+), 32 deletions(-) diff --git a/pufferlib/ocean/drive/binding.c b/pufferlib/ocean/drive/binding.c index 0dd93a5e5..b2a4c20b1 100644 --- a/pufferlib/ocean/drive/binding.c +++ b/pufferlib/ocean/drive/binding.c @@ -1788,7 +1788,6 @@ static int my_init(Env *env, PyObject *args, PyObject *kwargs) { env->init_mode = (int)unpack(kwargs, "init_mode"); env->control_mode = (int)unpack(kwargs, "control_mode"); env->simulation_mode = (int)unpack(kwargs, "simulation_mode"); - env->goal_advance_mode = (int)unpack(kwargs, "goal_advance_mode"); env->reward_conditioning = (bool)unpack(kwargs, "reward_conditioning"); env->reward_randomization = (bool)unpack(kwargs, "reward_randomization"); env->compute_eval_metrics = (bool)unpack(kwargs, "compute_eval_metrics"); diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index 701eb1b9f..17ee20987 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -64,16 +64,6 @@ #define SIMULATION_GIGAFLOW 0 #define SIMULATION_REPLAY 1 -// Goal advance modes — chosen when the SDC reaches the last goal in its -// sequence. REGENERATE recomputes a fresh set along the route (the -// gigaflow training pattern). SATURATE leaves the goal queue at its -// final state so the reached-goal condition won't fire again (the -// replay-mode pattern, where regenerating would dereference NULL paths -// for nuPlan bins without route info). Defaults to REGENERATE for -// gigaflow and SATURATE for replay; the Python config layer chooses. -#define GOAL_ADVANCE_REGENERATE 0 -#define GOAL_ADVANCE_SATURATE 1 - // Lane selection scoring #define LANE_SELECTION_DISTANCE_WEIGHT 0.7f #define LANE_SELECTION_HEADING_WEIGHT 0.3f @@ -346,7 +336,6 @@ struct Drive { int init_mode; int control_mode; int simulation_mode; - int goal_advance_mode; int termination_mode; float inactive_agent_threshold; int reward_conditioning; @@ -4877,11 +4866,10 @@ void c_step(Drive *env) { if (agent->current_goal_idx == env->num_target_waypoints) { // Last goal reached env->logs[i].num_goals_reached += 1; - if (env->goal_advance_mode == GOAL_ADVANCE_SATURATE) { - // Leave current_goal_idx saturated so the reached-goal - // condition won't fire again. Used by replay evals where - // regenerating route-based goals on WOMD/nuPlan bins - // would fail (path NULL or removed=1). 
+ if (env->simulation_mode == SIMULATION_REPLAY) { + // Replay mode: leave current_goal_idx saturated so the + // reached-goal condition won't fire again. Re-generating + // route-based goals on WOMD maps fails (removed=1). } else { compute_goals(env, agent_idx); } diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 1d9c4d081..7dc537ca5 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -57,7 +57,6 @@ def __init__( action_type="discrete", dynamics_model="classic", simulation_mode="gigaflow", - goal_advance_mode=None, termination_mode=0, inactive_agent_threshold=0.4, buf=None, @@ -230,19 +229,6 @@ def __init__( else: raise ValueError(f"simulation_mode must be one of 'gigaflow' or 'replay'. Got: {self.simulation_mode_str}") - # goal_advance_mode controls what happens when the SDC reaches the - # last goal in its sequence. None → auto-pick based on simulation_mode - # (gigaflow=regenerate, replay=saturate). Explicit values: "regenerate" - # or "saturate". - if goal_advance_mode is None: - self.goal_advance_mode = 1 if self.simulation_mode == 1 else 0 - elif goal_advance_mode == "regenerate": - self.goal_advance_mode = 0 - elif goal_advance_mode == "saturate": - self.goal_advance_mode = 1 - else: - raise ValueError(f"goal_advance_mode must be one of 'regenerate' or 'saturate'. Got: {goal_advance_mode}") - if self.control_mode_str == "control_vehicles": self.control_mode = 0 elif self.control_mode_str == "control_agents": @@ -401,7 +387,6 @@ def _env_init_kwargs(self, map_file, max_agents): "init_mode": self.init_mode, "control_mode": self.control_mode, "simulation_mode": self.simulation_mode, - "goal_advance_mode": self.goal_advance_mode, "reward_conditioning": self.reward_conditioning, "reward_randomization": self.reward_randomization, "compute_eval_metrics": self.compute_eval_metrics, From 1153abefcc043c4e074c7ca83312fb4c05f6f4ba Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 00:08:00 -0400 Subject: [PATCH 13/26] =?UTF-8?q?[WIP]=20eval:=20tier-A=20tests=20?= =?UTF-8?q?=E2=80=94=20dispatch,=20info-shape,=20behavior=20cleanup,=20ove?= =?UTF-8?q?rride=20stack?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds 4 tests covering the regression-prone surface the parser-only tests miss: test_maybe_run_dispatches_by_interval_and_enabled Stubs _run_one and verifies that maybe_run fires only enabled evaluators whose interval divides epoch. epoch=33 fires nothing, epoch=250 fires both 25-interval and 250-interval evaluators. test_flatten_infos_handles_shape_variations _flatten_infos must handle multi-worker (list of lists) AND PufferEnv (flat list) backends, plus None / empty entries — one bad isinstance check silently drops episode infos. test_behavior_class_cleanup_removes_symlink_dir Builds a real 5-bin map_dir, requests a 2-bin sample, verifies the tmp symlink dir gets created with 2 bins, then cleanup() removes it. tempfile.mkdtemp leftovers are a real footgun. test_eval_args_compose_train_section_and_clean_macro The full override stack: train baseline → section overrides → clean macro. Section beats baseline, explicit beats macro, untouched baseline survives. 25 tests total (was 21). 
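
For reference, the shape contract that test pins down, sketched (this is
not the base.py implementation; the name and control flow here are
illustrative only):

    def _flatten_infos(infos):
        # None / [] / [None] / [[], []] all flatten to [];
        # [[d1], [d2]] (multi-worker) and [d1, d2] (PufferEnv) → [d1, d2].
        flat = []
        for item in infos or []:
            if isinstance(item, dict):
                flat.append(item)
            elif item:
                flat.extend(d for d in item if d)
        return flat
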
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_eval_manager.py | 128 ++++++++++++++++++++++++++++++++++++- 1 file changed, 125 insertions(+), 3 deletions(-) diff --git a/tests/test_eval_manager.py b/tests/test_eval_manager.py index f009901df..e9df7daa8 100644 --- a/tests/test_eval_manager.py +++ b/tests/test_eval_manager.py @@ -1,8 +1,9 @@ -"""Smoke tests for EvalManager config parsing. +"""Smoke tests for EvalManager config parsing + dispatch. Doesn't load the full pufferl.py module (which pulls heavy training deps). -Just verifies the inheritance + clean macro + dotted-key expansion logic -behaves as the design doc says. +Verifies parser correctness, dispatch gating, info-flattening shape +handling, behavior-class symlink cleanup, and the train/section/macro +override resolution stack. """ import os @@ -12,6 +13,8 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from pufferlib.ocean.benchmark.evaluators import EvalResult, Evaluator +from pufferlib.ocean.benchmark.evaluators.behavior_class import BehaviorClassEvaluator from pufferlib.ocean.benchmark.manager import ( CLEAN_EVAL_OVERRIDES, EvalManager, @@ -206,3 +209,122 @@ def test_latest_checkpoint_falls_back_to_load_model_path(tmp_path): mgr = EvalManager.from_config(train_config, run_id="run123") # No models dir exists → falls back to load_model_path assert mgr.latest_checkpoint("puffer_drive") == "/some/resume/path.pt" + + +# -- Tier A: dispatch + invariants ----------------------------------------- + + +def test_maybe_run_dispatches_by_interval_and_enabled(monkeypatch): + """maybe_run should fire only enabled evaluators whose interval divides epoch.""" + train_config = { + "eval": { + "fires_at_25": {"type": "human_replay", "interval": 25}, + "fires_at_250": {"type": "human_replay", "interval": 250}, + "disabled": {"type": "human_replay", "interval": 25, "enabled": False}, + "zero_interval": {"type": "human_replay", "interval": 0}, + } + } + mgr = EvalManager.from_config(train_config) + + calls = [] + + def fake_run(ev, *, policy, env_name, logger, global_step): + calls.append(ev.name) + return EvalResult(metrics={}) + + monkeypatch.setattr(mgr, "_run_one", fake_run) + + mgr.maybe_run(epoch=25, policy=None, env_name="puffer_drive") + assert calls == ["fires_at_25"], "only the 25-interval evaluator fires at epoch 25" + calls.clear() + + mgr.maybe_run(epoch=250, policy=None, env_name="puffer_drive") + assert sorted(calls) == ["fires_at_25", "fires_at_250"], "both fire at epoch 250" + calls.clear() + + mgr.maybe_run(epoch=50, policy=None, env_name="puffer_drive") + assert calls == ["fires_at_25"], "only fires_at_25 at epoch 50; nothing else" + calls.clear() + + mgr.maybe_run(epoch=33, policy=None, env_name="puffer_drive") + assert calls == [], "nothing fires when no interval divides the epoch" + + +def test_flatten_infos_handles_shape_variations(): + """_flatten_infos must accept both list-of-list (multi-worker) and + flat-list (PufferEnv) info shapes, plus None / empty entries.""" + + class _Stub(Evaluator): + type_name = "_stub_flatten" + + def _should_stop(self, *args, **kwargs): + return True + + s = _Stub("test", {}, {}) + assert s._flatten_infos(None) == [] + assert s._flatten_infos([]) == [] + assert s._flatten_infos([None, None]) == [] + assert s._flatten_infos([[], []]) == [] + + d1, d2, d3 = {"a": 1}, {"b": 2}, {"c": 3} + # Multi-worker backend: list of per-worker info lists + assert s._flatten_infos([[d1], [d2]]) == [d1, d2] + assert s._flatten_infos([[d1, d2], [d3]]) == [d1, d2, 
d3] + # PufferEnv backend: flat list of info dicts + assert s._flatten_infos([d1, d2]) == [d1, d2] + + +def test_behavior_class_cleanup_removes_symlink_dir(tmp_path): + """BehaviorClassEvaluator builds a tmp symlink dir when sampling. + cleanup() must remove it; otherwise we accumulate leftovers.""" + map_dir = tmp_path / "bins" + map_dir.mkdir() + for i in range(5): + (map_dir / f"map_{i}.bin").write_text("a") + + config = { + "type": "behavior_class", + "env": {"map_dir": str(map_dir)}, + "eval": {"num_scenarios": 2}, + } + ev = BehaviorClassEvaluator("test_class", config, train_config={}) + + overrides = ev.env_overrides() + sampled = overrides["map_dir"] + assert sampled != str(map_dir), "sampling should redirect to a tmp dir" + assert os.path.isdir(sampled) + assert len([f for f in os.listdir(sampled) if f.endswith(".bin")]) == 2 + + ev.cleanup() + assert not os.path.exists(sampled), "tmp dir should be gone after cleanup" + assert ev._sampled_dir is None + + +def test_eval_args_compose_train_section_and_clean_macro(): + """_build_eval_args must fold train_config['env'] (baseline) + + section overrides + clean macro correctly. Section beats baseline, + explicit beats clean macro, baseline survives when not overridden.""" + train_config = { + "env": { + "lane_segment_dropout": 0.5, # training perturbation + "scenario_length": 91, + "num_agents": 1024, # only present in train baseline + }, + "train": {"seed": 42, "device": "cpu"}, + "eval": { + "validation": { + "type": "multi_scenario", + "interval": 25, + "env.scenario_length": 201, # section overrides baseline + # clean=true (default) → lane_segment_dropout zeroed by macro + # num_agents not specified → falls through to train baseline + }, + }, + } + mgr = EvalManager.from_config(train_config) + ev = mgr.evaluators[0] + args = mgr._build_eval_args(ev, env_name="puffer_drive", global_step=0) + + assert args["env"]["scenario_length"] == 201, "section override wins" + assert args["env"]["lane_segment_dropout"] == 0.0, "clean macro applied" + assert args["env"]["num_agents"] == 1024, "train baseline preserved" From 1cfbcc3658eacebaabe1bbb00d3d6471ae4ba662 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 00:15:45 -0400 Subject: [PATCH 14/26] =?UTF-8?q?drive.ini:=20bump=20validation=5Fgigaflow?= =?UTF-8?q?=20interval=2025=20=E2=86=92=20250?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aligns with the heavier-eval cadence the rest of the new eval sections use. 25-epoch interval was too aggressive for inline gigaflow validation given the per-pass setup cost. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/config/ocean/drive.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index ebe977e02..b49cc1de8 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -216,7 +216,7 @@ render_map = none [eval.validation_gigaflow] type = "multi_scenario" enabled = true -interval = 25 +interval = 250 mode = "inline" clean = true render = false From c2fa176400df91e9dc2fac5adf33e3975c85b88f Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 00:29:38 -0400 Subject: [PATCH 15/26] [WIP] eval: log eval_seconds per evaluator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Time the rollout in Evaluator.rollout (base class) and inject `eval_seconds` into the returned metrics dict. 
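
What the split buys a subclass, sketched with a made-up evaluator
(LatencyProbe is illustrative, not part of this PR):

    class LatencyProbe(Evaluator):
        type_name = "latency_probe"

        def _run_rollout_loop(self, vecenv, policy, args):
            return {"probe_metric": 1.0}  # timing is added by rollout()

    # .rollout(...) returns an EvalResult whose metrics now read
    # {"probe_metric": 1.0, "eval_seconds": <wall-clock of the loop>}
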
Manager's _log posts it to wandb under {ev.name}/eval_seconds — wall-clock cost per evaluator becomes a first-class panel. Refactor WOSACEvaluator to override _run_rollout_loop instead of rollout — now WOSAC also benefits from the timing without code duplication. Test: stub evaluator with a forced 20ms floor; verifies eval_seconds lands in the result and the inner metrics survive. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/ocean/benchmark/evaluators/base.py | 11 ++++++++--- pufferlib/ocean/benchmark/evaluators/wosac.py | 7 +++---- tests/test_eval_manager.py | 19 +++++++++++++++++++ 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/pufferlib/ocean/benchmark/evaluators/base.py b/pufferlib/ocean/benchmark/evaluators/base.py index 77a7ad95e..08406b5cd 100644 --- a/pufferlib/ocean/benchmark/evaluators/base.py +++ b/pufferlib/ocean/benchmark/evaluators/base.py @@ -1,5 +1,6 @@ """Evaluator base class + default rollout loop + EvalResult dataclass.""" +import time from dataclasses import dataclass, field from typing import ClassVar @@ -61,12 +62,16 @@ def vec_overrides(self) -> dict: def rollout(self, vecenv, policy, args) -> EvalResult: """Default rollout: reset → step → collect infos → aggregate. - Subclasses tune behavior via the hooks below. Override this - method directly only if the loop shape itself needs to differ - (e.g. per-scene multi-rollout patterns). + Times the inner work and adds `eval_seconds` to metrics so wandb + panels show wall-clock cost per evaluator. Subclasses tune + behavior by overriding `_run_rollout_loop` (and optionally + `_render_pass`); only override this method if the loop shape + itself needs to differ. """ + t0 = time.time() metrics = self._run_rollout_loop(vecenv, policy, args) frames = self._render_pass(vecenv, policy, args) if self.render else [] + metrics["eval_seconds"] = float(time.time() - t0) return EvalResult(metrics=metrics, frames=frames) def _run_rollout_loop(self, vecenv, policy, args) -> dict: diff --git a/pufferlib/ocean/benchmark/evaluators/wosac.py b/pufferlib/ocean/benchmark/evaluators/wosac.py index 8733c8a2a..b8a7d82af 100644 --- a/pufferlib/ocean/benchmark/evaluators/wosac.py +++ b/pufferlib/ocean/benchmark/evaluators/wosac.py @@ -8,7 +8,7 @@ from typing import ClassVar -from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator +from pufferlib.ocean.benchmark.evaluators.base import Evaluator class WOSACEvaluator(Evaluator): @@ -25,7 +25,7 @@ def env_overrides(self) -> dict: env.update(self.config.get("env", {})) return env - def rollout(self, vecenv, policy, args) -> EvalResult: + def _run_rollout_loop(self, vecenv, policy, args) -> dict: # Inner class pulls pandas/matplotlib — keep the import inside the # rollout so the wrapper class can be imported in environments # that don't have those (e.g. unit-test smoke envs). 
@@ -38,5 +38,4 @@ def rollout(self, vecenv, policy, args) -> EvalResult: results["total_num_agents"] = float(df["num_agents_per_scene"].sum()) results["total_unique_scenarios"] = float(df.index.unique().shape[0]) results["realism_meta_score_std"] = float(df["realism_meta_score"].std()) - results = {k: (float(v) if hasattr(v, "item") else v) for k, v in results.items()} - return EvalResult(metrics=results, frames=[]) + return {k: (float(v) if hasattr(v, "item") else v) for k, v in results.items()} diff --git a/tests/test_eval_manager.py b/tests/test_eval_manager.py index e9df7daa8..33a1fb1e1 100644 --- a/tests/test_eval_manager.py +++ b/tests/test_eval_manager.py @@ -300,6 +300,25 @@ def test_behavior_class_cleanup_removes_symlink_dir(tmp_path): assert ev._sampled_dir is None +def test_rollout_records_eval_seconds(): + """Every rollout's metrics dict should include `eval_seconds` so wandb + panels show wall-clock cost per evaluator.""" + import time as _time + + class _Stub(Evaluator): + type_name = "_stub_timing" + + def _run_rollout_loop(self, vecenv, policy, args): + _time.sleep(0.02) # forced floor so the recorded time is > 0 + return {"some_metric": 1.5} + + s = _Stub("test", {}, {}) + result = s.rollout(vecenv=None, policy=None, args={}) + assert "eval_seconds" in result.metrics + assert result.metrics["eval_seconds"] >= 0.02 + assert result.metrics["some_metric"] == 1.5 + + def test_eval_args_compose_train_section_and_clean_macro(): """_build_eval_args must fold train_config['env'] (baseline) + section overrides + clean macro correctly. Section beats baseline, From 38a9c05523b3c1aeeebca267b30f9cf8bcafe223 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 00:43:22 -0400 Subject: [PATCH 16/26] drive.py: fix missing goal_radius in resample-time binding.shared MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The eval-mode resample path at drive.py:447 was missing goal_radius in its kwargs to binding.shared, while the initial-spawn call at line 293 has it. binding.shared's C side (binding.c:1545) requires goal_radius via unpack(), so the resample crashes with "Missing required keyword argument 'goal_radius'" once a scenario batch completes and a new one is requested. Latent bug — never triggered because the legacy [eval] section had multi_scenario_eval = False as the default, so the eval rollout path that triggers resample never fired in production. The new EvalManager flips multi_scenario_eval-equivalent ([eval.validation_gigaflow]) on by default, surfacing the crash. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/ocean/drive/drive.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 7dc537ca5..e417e3473 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -458,6 +458,7 @@ def step(self, actions): min_agents_per_env=self.min_agents_per_env, max_agents_per_env=self.max_agents_per_env, num_eval_scenarios=self.current_num_eval_scenarios, # Use the dynamic size here + goal_radius=self.goal_radius, ) # In eval mode, don't wrap counter - allows termination condition to work correctly From 6514e8d6a8081bf6f87c1519ee8394a1fa1ab55b Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 00:48:28 -0400 Subject: [PATCH 17/26] drive.ini: merge validation_gigaflow + validation_gigaflow_render Single section now does both: 250-scenario metric pass + 5-scenario render pass via the render_num_scenarios knob. 
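
After the manager's dotted-key flattening, the merged section resolves to
roughly this per-evaluator config (abridged; values taken from the diff
below):

    {
        "interval": 250,
        "render": True,
        "render_views": ["sim_state", "bev"],
        "eval": {"num_scenarios": 250, "render_num_scenarios": 5},
    }
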
The split made sense before render became a per-section flag; it's redundant now. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/config/ocean/drive.ini | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index b49cc1de8..320184608 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -219,7 +219,11 @@ enabled = true interval = 250 mode = "inline" clean = true -render = false +; Single rollout: 250-scenario metric pass + a 5-scenario render pass +; for the wandb panel. render_num_scenarios decouples the render budget +; from the metric pass so videos stay cheap. +render = true +render_views = ["sim_state", "bev"] env.simulation_mode = "gigaflow" env.map_dir = "pufferlib/resources/drive/binaries/carla_py123d" env.num_maps = 8 @@ -229,14 +233,7 @@ env.max_agents_per_env = 50 env.scenario_length = 3000 env.resample_frequency = 3000 eval.num_scenarios = 250 - -[eval.validation_gigaflow_render] -inherits = "validation_gigaflow" -enabled = true -interval = 250 -render = true -render_views = ["sim_state", "bev"] -eval.num_scenarios = 5 +eval.render_num_scenarios = 5 ; --------------------------------------------------------------------------- ; Driving-behaviour evaluation: nuPlan scenes labeled by scene type. Each From 2b074149abd765409e3035725ee371404d5e6e7f Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 00:54:04 -0400 Subject: [PATCH 18/26] [WIP] eval: fix render-path vec.make signature for PufferEnv backend PufferEnv backend in pufferlib.vector.make treats env_creator as a single callable and passes env_args/env_kwargs to it directly (line 697 in vector.py). Multiprocessing/Serial backends expect lists. Render must use PufferEnv (one ffmpeg pipe per env), so pass a single creator + dict, not lists. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ocean/benchmark/evaluators/multi_scenario.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py index bd6c5672a..76f3f0ab0 100644 --- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py +++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py @@ -106,14 +106,16 @@ def _render_pass(self, vecenv, policy, args) -> list: view_idx = _VIEW_NAME_TO_IDX.get(view, 0) view_suffix = "" if view == "sim_state" else f"_{view}" + # PufferEnv backend treats the creator as a single callable and + # passes env_args/env_kwargs to it directly (not as per-env lists). + # The Multiprocessing/Serial backends expect lists; we don't use + # those here because EGL render assumes one ffmpeg pipe per env. 
vec = pufferlib.vector.make(
-            [make_env],
-            env_args=[[]],
-            env_kwargs=[render_env_kwargs],
+            make_env,
+            env_args=[],
+            env_kwargs=render_env_kwargs,
             backend="PufferEnv",
             num_envs=1,
-            num_workers=1,
-            batch_size=1,
         )
         target = vec if not hasattr(vec, "envs") else vec.envs[0]
         internal = getattr(target, "num_envs", 1)

From 9b7f9d64b8385d85baeca5e45c58577a64c3b647 Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sun, 10 May 2026 01:14:58 -0400
Subject: [PATCH 19/26] [WIP] eval: validation_gigaflow runs 1 scenario per map, not 250
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 8-map carla dir doesn't need 250 scenarios — gigaflow's C-side eval
already creates one internal env per scenario (capped at num_scenarios)
and steps them in a single batched rollout, so num_scenarios=8 covers
every map exactly once in parallel.

Drop num_agents 512→400 to fill exactly 8 × 50 slots with no wasted
capacity.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 pufferlib/config/ocean/drive.ini | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini
index 320184608..ba3a57231 100644
--- a/pufferlib/config/ocean/drive.ini
+++ b/pufferlib/config/ocean/drive.ini
@@ -219,20 +219,22 @@ enabled = true
 interval = 250
 mode = "inline"
 clean = true
-; Single rollout: 250-scenario metric pass + a 5-scenario render pass
-; for the wandb panel. render_num_scenarios decouples the render budget
-; from the metric pass so videos stay cheap.
+; One rollout per map (8 carla maps). C-side gigaflow eval cycles maps
+; sequentially within one PufferEnv worker: env_count = min(ceil(num_agents
+; / max_per_env), num_scenarios), so 8 internal envs (one per map) step
+; in parallel via the batched C kernel — no multiprocessing needed.
+; render_num_scenarios decouples the render budget so videos stay cheap.
 render = true
 render_views = ["sim_state", "bev"]
 env.simulation_mode = "gigaflow"
 env.map_dir = "pufferlib/resources/drive/binaries/carla_py123d"
 env.num_maps = 8
-env.num_agents = 512
+env.num_agents = 400
 env.min_agents_per_env = 50
 env.max_agents_per_env = 50
 env.scenario_length = 3000
 env.resample_frequency = 3000
-eval.num_scenarios = 250
+eval.num_scenarios = 8
 eval.render_num_scenarios = 5

From 5b7997eda830398fd2cb620246ec6634978a33e7 Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sun, 10 May 2026 01:22:12 -0400
Subject: [PATCH 20/26] [WIP] eval: cap render clip length, decouple from metric-pass scenario_length
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Render path was falling back to env.scenario_length (3000 = 100s mp4 at
30fps) because render_max_steps wasn't a real config knob. Two fixes:

1. multi_scenario.py: read eval.render_max_steps from args["eval"] where
   _build_eval_args puts evaluator-private fields, and default to 91
   (~3 sec) — not scenario_length, which is the metric pass.

2. drive.ini: set eval.render_max_steps = 91 on validation_gigaflow
   explicitly so the comment + value document the intent.

EGL render is ~3 fps wall-clock at 1080p, so 91 × 5 × 2 ≈ 5 min/render
pass instead of ~2.8 h when defaulting to scenario_length=3000.
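
The budget arithmetic, spelled out (a sketch; the ~3 fps figure is the
wall-clock estimate above, not a measured constant):

    steps, scenarios, views, fps = 91, 5, 2, 3.0
    seconds = steps * scenarios * views / fps  # 910 frames → ~303 s ≈ 5 min
    # with steps = 3000 (the scenario_length fallback): ~10000 s ≈ 2.8 h
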
Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/config/ocean/drive.ini | 3 +++ pufferlib/ocean/benchmark/evaluators/multi_scenario.py | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index ba3a57231..c464c4c2a 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -236,6 +236,9 @@ env.scenario_length = 3000 env.resample_frequency = 3000 eval.num_scenarios = 8 eval.render_num_scenarios = 5 +; ~3 sec mp4 per scenario at 30 fps. Render is ~3 fps wall-clock at 1080p, +; so 91 steps × 5 scenarios × 2 views ≈ 5 min — well below the eval cadence. +eval.render_max_steps = 91 ; --------------------------------------------------------------------------- ; Driving-behaviour evaluation: nuPlan scenes labeled by scene type. Each diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py index 76f3f0ab0..f69c17ff3 100644 --- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py +++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py @@ -141,7 +141,11 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: eval_cfg = self.config.get("eval", {}) metric_count = int(eval_cfg.get("num_scenarios", 1)) num_scenarios = int(eval_cfg.get("render_num_scenarios", min(metric_count, 3))) - max_steps = args.get("render_max_steps") or int(args["env"].get("scenario_length", 91)) + # Render-clip length: independent of scenario_length (which is the + # metric-pass length). At 30 fps, 91 steps = ~3s mp4. Per-step EGL + # render is the bottleneck (~3 fps wall-clock at 1080p), so keeping + # this small directly bounds the render-pass runtime. + max_steps = int(args.get("eval", {}).get("render_max_steps", 91)) saved_cwd = os.getcwd() os.chdir(out_dir) From ec0698a8762dc03de16f6f05b178f3d94852a729 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 01:25:46 -0400 Subject: [PATCH 21/26] =?UTF-8?q?[WIP]=20eval:=20bump=20render=5Fmax=5Fste?= =?UTF-8?q?ps=20default=2091=20=E2=86=92=20300=20(10s=20clip=20vs=203s)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/config/ocean/drive.ini | 6 +++--- pufferlib/ocean/benchmark/evaluators/multi_scenario.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index c464c4c2a..de05defa0 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -236,9 +236,9 @@ env.scenario_length = 3000 env.resample_frequency = 3000 eval.num_scenarios = 8 eval.render_num_scenarios = 5 -; ~3 sec mp4 per scenario at 30 fps. Render is ~3 fps wall-clock at 1080p, -; so 91 steps × 5 scenarios × 2 views ≈ 5 min — well below the eval cadence. -eval.render_max_steps = 91 +; ~10 sec mp4 per scenario at 30 fps. Render is ~3 fps wall-clock at 1080p, +; so 300 steps × 5 scenarios × 2 views ≈ 17 min — below the eval cadence. +eval.render_max_steps = 300 ; --------------------------------------------------------------------------- ; Driving-behaviour evaluation: nuPlan scenes labeled by scene type. 
Each
diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
index f69c17ff3..8f1777320 100644
--- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
+++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
@@ -142,10 +142,10 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir:
         metric_count = int(eval_cfg.get("num_scenarios", 1))
         num_scenarios = int(eval_cfg.get("render_num_scenarios", min(metric_count, 3)))
         # Render-clip length: independent of scenario_length (which is the
-        # metric-pass length). At 30 fps, 91 steps = ~3s mp4. Per-step EGL
+        # metric-pass length). At 30 fps, 300 steps = ~10s mp4. Per-step EGL
         # render is the bottleneck (~3 fps wall-clock at 1080p), so keeping
         # this small directly bounds the render-pass runtime.
-        max_steps = int(args.get("eval", {}).get("render_max_steps", 91))
+        max_steps = int(args.get("eval", {}).get("render_max_steps", 300))

         saved_cwd = os.getcwd()
         os.chdir(out_dir)

From cf955d9c3a7f0c846a8573aa93d765350abf0376 Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sun, 10 May 2026 01:34:55 -0400
Subject: [PATCH 22/26] [WIP] eval: default to PufferEnv inline, keep Multiprocessing opt-in
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The old MultiScenarioEvaluator forced Multiprocessing + async_reset by
default, which (a) added fork/IPC startup cost without throughput gain
at our scale, (b) broke render (one ffmpeg pipe per env requires
single-process), and (c) couldn't construct the vec env at all because
the manager passed the PufferEnv-shaped single creator to the
list-shaped backend dispatch.

Three changes:

1. multi_scenario.py: drop the vec_overrides() and _initial_reset()
   overrides that pinned Multiprocessing — inherit base-class PufferEnv
   + sync reset.

2. manager.py:_run_inline: branch on backend so Multiprocessing remains
   a valid opt-in via [eval.<name>.vec] backend = "Multiprocessing".
   Useful for memory-bound replay sweeps, hetero-config evals, or async
   overlap on long rollouts — not needed for the 8-map carla validation
   eval that's the hot path today.

3. The 8-map validation eval now spins up one Drive with 8 internal
   envs in-process, no fork.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../benchmark/evaluators/multi_scenario.py | 19 ++++-------
 pufferlib/ocean/benchmark/manager.py       | 32 +++++++++++++------
 2 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
index 8f1777320..1c161c5db 100644
--- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
+++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
@@ -17,12 +17,6 @@ class MultiScenarioEvaluator(Evaluator):

     type_name: ClassVar[str] = "multi_scenario"

-    def vec_overrides(self) -> dict:
-        # Multi-worker by default for throughput. Override via [eval.<name>.vec].
-        backend = self.train_config.get("vec", {}).get("backend", "PufferEnv")
-        num_envs = int(self.config.get("vec", {}).get("num_envs", 1))
-        return {"backend": backend, "num_envs": num_envs}
-
     def env_overrides(self) -> dict:
         env = {
             "eval_mode": 1,
@@ -32,13 +26,14 @@ def env_overrides(self) -> dict:
         env.update(self.config.get("env", {}))
         return env

-    # -- Loop hooks --
+    # vec_overrides + _initial_reset use the base-class defaults: PufferEnv
+    # backend with num_envs=1 and a sync reset. 
Drive's C side already
+    # allocates `min(ceil(num_agents/max_per_env), num_eval_scenarios)`
+    # internal envs and steps them in one batched kernel call, so we get
+    # full per-map parallelism without paying multi-process fork/IPC cost.
+    # Override [eval.<name>.vec] in the ini if you genuinely need workers.

-    def _initial_reset(self, vecenv, args):
-        # Multi-worker async reset gives us the parallel-throughput path.
-        vecenv.async_reset(args.get("seed", 42))
-        ob, _, _, _, _, _, _ = vecenv.recv()
-        return ob
+    # -- Loop hooks --

     def _maybe_reset_lstm(self, state, steps, args):
         # Reset between scenarios — gigaflow's auto-resample fires at the
diff --git a/pufferlib/ocean/benchmark/manager.py b/pufferlib/ocean/benchmark/manager.py
index 9e79a50bc..2ec021c08 100644
--- a/pufferlib/ocean/benchmark/manager.py
+++ b/pufferlib/ocean/benchmark/manager.py
@@ -139,18 +139,30 @@ def _run_inline(self, ev: Evaluator, policy, env_name: str, global_step) -> Eval
         make_env = env_module.env_creator(env_name)

         vec_kwargs = ev.vec_overrides()
+        backend = vec_kwargs.get("backend", "PufferEnv")
         num_envs = int(vec_kwargs.get("num_envs", 1))
-        env_kwargs_list = [args["env"] for _ in range(num_envs)]
-        env_creators = [make_env] * num_envs
-        env_args_list = [[]] * num_envs
-        vec_call_kwargs = dict(vec_kwargs)
-        vec_call_kwargs.setdefault("num_workers", num_envs)
-        vec_call_kwargs.setdefault("batch_size", num_envs)
-
-        vecenv = pufferlib.vector.make(
-            env_creators, env_args=env_args_list, env_kwargs=env_kwargs_list, **vec_call_kwargs
-        )
+        # PufferEnv is the default: Drive's C kernel batches all internal
+        # envs in one call so we get per-map parallelism without paying
+        # fork/IPC cost, and render shares the single ffmpeg pipeline.
+        # Multiprocessing is opt-in via [eval.<name>.vec] backend = ...
+        # for evals that genuinely need it (memory-split for big replay
+        # sweeps, hetero scenarios, async overlap on long rollouts).
+        # The two backends have incompatible call shapes; branch here.
+        if backend == "PufferEnv":
+            vecenv = pufferlib.vector.make(
+                make_env, env_args=[], env_kwargs=args["env"], backend=backend, num_envs=num_envs
+            )
+        else:
+            vec_call_kwargs = dict(vec_kwargs)
+            vec_call_kwargs.setdefault("num_workers", num_envs)
+            vec_call_kwargs.setdefault("batch_size", num_envs)
+            vecenv = pufferlib.vector.make(
+                [make_env] * num_envs,
+                env_args=[[]] * num_envs,
+                env_kwargs=[args["env"] for _ in range(num_envs)],
+                **vec_call_kwargs,
+            )
         try:
             res = ev.rollout(vecenv, policy, args)
         finally:

From 725475f0c57291f01063b871c6f1991e57122d1b Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sun, 10 May 2026 01:46:23 -0400
Subject: [PATCH 23/26] [WIP] eval: render returns only this-pass mp4s; nuplan paths use real path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes for the multi-evaluator smoke test:

1. multi_scenario._render_view: glob filter excludes mp4s that already
   existed in out_dir before this pass. The dir is shared across epochs
   and across views, so a bare `out_dir.glob('*.mp4')` was returning
   every mp4 from prior render passes too — turning epoch 12510's 16-mp4
   render into a 32-mp4 wandb upload (the 16 from epoch 12505 were still
   on disk).

2. drive.ini: replace `/scratch/$USER/data/nuplan/...` with the literal
   path. configparser doesn't expand env vars, and the manager doesn't
   either, so behavior evals were crashing on FileNotFoundError trying
   to open `/scratch/$USER/data/...`.
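
If ini-level expansion is ever wanted instead, the hook is a stdlib
one-liner (sketch; where to call it in the manager's config load is the
open question):

    import os
    os.path.expandvars("/scratch/$USER/data/nuplan/categories_v021")
    # → "/scratch/ev2237/data/nuplan/categories_v021" for user ev2237
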
Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/config/ocean/drive.ini | 24 +++++++++---------- .../benchmark/evaluators/multi_scenario.py | 7 +++++- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index de05defa0..567ff6098 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -269,73 +269,73 @@ eval.render_num_scenarios = 2 inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/nuplan_mini_train_bins" +env.map_dir = "/scratch/ev2237/data/nuplan/nuplan_mini_train_bins" [eval.behaviors_hard_stop] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/hard_stop" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/hard_stop" [eval.behaviors_highway_straight] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/highway_straight" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/highway_straight" [eval.behaviors_lane_change] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/lane_change" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/lane_change" [eval.behaviors_merge] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/merge" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/merge" [eval.behaviors_parked_cars] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/parked_cars" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/parked_cars" [eval.behaviors_roundabout] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/roundabout" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/roundabout" [eval.behaviors_stopped_traffic] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/stopped_traffic" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/stopped_traffic" [eval.behaviors_traffic_light_green] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/traffic_light_green" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_green" [eval.behaviors_traffic_light_stop] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/traffic_light_stop" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_stop" [eval.behaviors_unprotected_left] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/unprotected_left" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_left" [eval.behaviors_unprotected_right] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/unprotected_right" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_right" ; --------------------------------------------------------------------------- ; Optional: WOSAC realism eval. 
Off by default. diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py index 1c161c5db..054e73cd4 100644 --- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py +++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py @@ -144,6 +144,11 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: saved_cwd = os.getcwd() os.chdir(out_dir) + # Snapshot existing mp4s so we only return files written in this + # pass — out_dir is shared across epochs (and across views), so + # globbing the dir at the end would re-pick up every mp4 from prior + # render passes and make _log think we rendered far more than we did. + existing = set(out_dir.glob("*.mp4")) try: state = self._init_lstm_state(num_agents, policy, device, args) scenarios_processed = 0 @@ -173,7 +178,7 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: finally: os.chdir(saved_cwd) - return sorted(p for p in out_dir.glob("*.mp4")) + return sorted(p for p in out_dir.glob("*.mp4") if p not in existing) _VIEW_NAME_TO_IDX = { From 38eec92094a041c19caa002bb7a9bb212db9c9a4 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 01:47:48 -0400 Subject: [PATCH 24/26] [WIP] eval: stamp global_step into render mp4 filenames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously `{scenario_id}{view_suffix}.mp4` — each render epoch overwrote the previous epoch's mp4 in place, making the prior fix's existing-file snapshot filter incorrect (filter would exclude the freshly-written file because Path equality matches the prior path). Now `{scenario_id}_step{N}{view_suffix}.mp4` so: - successive eval epochs produce distinct mp4s (no overwrites). - wandb's render carousel shows one entry per epoch, letting the user watch policy evolve over training. - the return-paths glob is exact: just files matching this step's pattern, no snapshot-filtering trickery. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../benchmark/evaluators/multi_scenario.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py index 054e73cd4..5a0861dd0 100644 --- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py +++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py @@ -96,10 +96,17 @@ def _render_pass(self, vecenv, policy, args) -> list: if num_maps > 1: render_env_kwargs["starting_map"] = random.randint(0, num_maps - 1) + # Stamp the training step into the filename so successive epochs + # produce distinct mp4s (Town01.xodr_step25100000_bev.mp4) instead + # of overwriting in place. wandb then shows one entry per epoch + # in the render carousel — useful for watching policy evolve over + # training. global_step falls back to 0 for ad-hoc CLI runs. + step_suffix = f"_step{int(args.get('global_step') or 0)}" + all_paths = [] for view in self.render_views: view_idx = _VIEW_NAME_TO_IDX.get(view, 0) - view_suffix = "" if view == "sim_state" else f"_{view}" + view_suffix = step_suffix + ("" if view == "sim_state" else f"_{view}") # PufferEnv backend treats the creator as a single callable and # passes env_args/env_kwargs to it directly (not as per-env lists). 
@@ -151,11 +158,13 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir:

         saved_cwd = os.getcwd()
         os.chdir(out_dir)
-        # Snapshot existing mp4s so we only return files written in this
-        # pass — out_dir is shared across epochs (and across views), so
-        # globbing the dir at the end would re-pick up every mp4 from prior
-        # render passes and make _log think we rendered far more than we did.
-        existing = set(out_dir.glob("*.mp4"))
+        # Filename pattern for this pass: each scenario writes
+        # `{scenario_id}_step{N}{view_suffix}.mp4`. Globbing by step suffix
+        # picks up only this-pass mp4s and ignores accumulated files from
+        # prior epochs that share the dir. The suffix is recomputed from
+        # args["global_step"] exactly as _render_pass builds it (and as
+        # set_video_suffix stamped each env), so the two stay aligned.
+        step_glob = f"*_step{int(args.get('global_step') or 0)}*.mp4"
         try:
             state = self._init_lstm_state(num_agents, policy, device, args)
             scenarios_processed = 0
@@ -178,7 +187,7 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir:
         finally:
             os.chdir(saved_cwd)

-        return sorted(p for p in out_dir.glob("*.mp4") if p not in existing)
+        return sorted(out_dir.glob(step_glob))


 _VIEW_NAME_TO_IDX = {

From 0e83577bcd99281fc339ad58e5ae18a92e4ae4c7 Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sun, 10 May 2026 02:01:01 -0400
Subject: [PATCH 25/26] [WIP] eval: lift render to base, behaviors now render
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Behavior-class evaluators previously had render=true in config but no
_render_pass implementation, so the videos silently never appeared.
Three changes:

1. base.py: lift _render_pass + _render_view from MultiScenarioEvaluator
   into Evaluator. The render loop is generic — fresh PufferEnv with
   render_mode=headless, ffmpeg pipe per active env per view, mp4s
   stamped with global_step in the filename.

2. base.py: add a _render_env_overrides hook so subclasses can tweak the
   render env (default = metric env + render_mode=headless). Render loop
   now caps internal-env render count at eval.render_num_scenarios
   instead of always rendering the full batch (the C kernel still steps
   the full batch — just fewer ffmpeg pipes). Drops the dead
   batch_size_eval write.

3. multi_scenario.py: keeps the random-starting_map override (its only
   real difference from the default render path) — everything else is
   now inherited.

Result: behavior_class and human_replay inherit a working render path.
For 12 behavior classes with render_num_scenarios=2, render cost is
bounded at 300 steps × 2 scenarios × 2 views at ~3 fps ≈ 7 min/class.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 pufferlib/ocean/benchmark/evaluators/base.py   | 131 ++++++++++++-
 .../benchmark/evaluators/multi_scenario.py     | 173 ++----------------
 2 files changed, 141 insertions(+), 163 deletions(-)

diff --git a/pufferlib/ocean/benchmark/evaluators/base.py b/pufferlib/ocean/benchmark/evaluators/base.py
index 08406b5cd..999102ca6 100644
--- a/pufferlib/ocean/benchmark/evaluators/base.py
+++ b/pufferlib/ocean/benchmark/evaluators/base.py
@@ -159,6 +159,133 @@ def _aggregate_infos(self, infos: list) -> dict:
                 out[k] = float(np.mean(vals))
         return out

+    # -- Render (default EGL → ffmpeg mp4 pipeline) ----------------------
+
     def _render_pass(self, vecenv, policy, args) -> list:
-        """Render hook. 
Subclasses that support frame capture override this.""" - return [] + """Build a fresh PufferEnv with `render_mode=headless`, render one + clip per (scenario, view), return mp4 paths. Returns [] for non-egl + backends. Subclasses customize the render env via `_render_env_overrides`. + """ + backend = args.get("render_backend", "egl") + if backend != "egl": + return [] + + import importlib + from pathlib import Path + + import pufferlib + + out_dir = Path(args.get("render_results_dir") or args.get("eval_results_dir") or ".") / "mp4" + out_dir.mkdir(parents=True, exist_ok=True) + + package = args.get("package", "ocean") + module_name = "pufferlib.ocean" if package == "ocean" else f"pufferlib.environments.{package}" + env_module = importlib.import_module(module_name) + make_env = env_module.env_creator(args["env_name"]) + + render_env_kwargs = self._render_env_overrides(args) + # Stamp the training step into the filename so successive epochs + # produce distinct mp4s and wandb's render carousel shows policy + # evolution. global_step falls back to 0 for ad-hoc CLI runs. + step_suffix = f"_step{int(args.get('global_step') or 0)}" + + all_paths = [] + for view in self.render_views: + view_idx = _VIEW_NAME_TO_IDX.get(view, 0) + view_suffix = step_suffix + ("" if view == "sim_state" else f"_{view}") + + vec = pufferlib.vector.make( + make_env, + env_args=[], + env_kwargs=render_env_kwargs, + backend="PufferEnv", + num_envs=1, + ) + target = vec if not hasattr(vec, "envs") else vec.envs[0] + internal = getattr(target, "num_envs", 1) + for e in range(internal): + target.set_video_suffix(view_suffix, env_idx=e) + + paths = self._render_view(vec, target, policy, args, view_idx, out_dir, step_suffix) + vec.close() + all_paths.extend(paths) + return all_paths + + def _render_env_overrides(self, args) -> dict: + """Build env kwargs for the render env. Default: same as the + metric-pass env plus `render_mode=headless`. Subclasses override + to inject things like a random starting_map (gigaflow validation) + or a shrunken bin set (behavior class).""" + out = dict(args["env"]) + out["render_mode"] = "headless" + return out + + def _render_view(self, vecenv, target_env, policy, args, view_idx, out_dir, step_suffix) -> list: + """One rollout per render-env, writes one mp4 per active env per view. + Caps how many internal envs actually feed ffmpeg pipes via + `eval.render_num_scenarios` so render cost stays bounded.""" + import os + + import numpy as np + import torch + + import pufferlib + + device = args["train"]["device"] + num_agents = vecenv.observation_space.shape[0] + + eval_cfg = self.config.get("eval", {}) + metric_count = int(eval_cfg.get("num_scenarios", 1)) + num_scenarios = int(eval_cfg.get("render_num_scenarios", min(metric_count, 3))) + # Render-clip length: independent of scenario_length (which is the + # metric-pass length). At 30 fps, 300 steps = ~10s mp4. Per-step EGL + # render is the bottleneck (~3 fps wall-clock at 1080p), so keeping + # this small directly bounds the render-pass runtime. + max_steps = int(eval_cfg.get("render_max_steps", 300)) + + saved_cwd = os.getcwd() + os.chdir(out_dir) + # Glob for files written this pass: every mp4 has the step suffix, + # so a step_suffix-prefixed glob filters out accumulated mp4s from + # prior epochs (the dir is shared across runs). 
+        step_glob = f"*{step_suffix}*.mp4"
+        try:
+            state = self._init_lstm_state(num_agents, policy, device, args)
+            scenarios_processed = 0
+            while scenarios_processed < num_scenarios:
+                ob, _ = vecenv.reset()
+                scenarios = vecenv.get_state()
+                num_in_batch = len(scenarios)
+                # Cap how many envs render this iteration: the C kernel
+                # steps the full batch regardless, but only the first
+                # `to_render` envs feed ffmpeg pipes.
+                to_render = min(num_in_batch, num_scenarios - scenarios_processed)
+                if state:
+                    state["lstm_h"].zero_()
+                    state["lstm_c"].zero_()
+                for _ in range(max_steps):
+                    with torch.no_grad():
+                        ob_t = torch.as_tensor(ob).to(device)
+                        logits, _ = policy.forward_eval(ob_t, state)
+                        action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True)
+                    action = action.cpu().numpy().reshape(vecenv.action_space.shape)
+                    if isinstance(logits, torch.distributions.Normal):
+                        action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high)
+                    ob, _, _, _, _ = vecenv.step(action)
+                    for e in range(to_render):
+                        target_env.render(env_idx=e, view_mode=view_idx)
+                for e in range(to_render):
+                    target_env.close_client(env_idx=e)
+                scenarios_processed += to_render
+        finally:
+            os.chdir(saved_cwd)
+
+        return sorted(out_dir.glob(step_glob))
+
+
+_VIEW_NAME_TO_IDX = {
+    "sim_state": 0,
+    "bev": 1,
+    "topdown_sim": 2,
+    "bev_all": 3,
+}
diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
index 5a0861dd0..27930dea3 100644
--- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
+++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
@@ -1,14 +1,7 @@
-"""MultiScenarioEvaluator — distribute scenarios across workers, one rollout
-per scenario, mean per-scenario metrics. Drives both the gigaflow validation
-path and replay-style multi-scenario evals.
-
-Inherits the default loop from `Evaluator`; overrides `_should_stop` (cap by
-scenario count), `_initial_reset` (async reset for multi-worker throughput),
-`_maybe_reset_lstm` (per-scenario LSTM reset), and `_render_pass` (the C-side
-EGL → ffmpeg mp4 dump)."""
-
-import os
-from pathlib import Path
+"""MultiScenarioEvaluator — gigaflow validation eval. C-side eval_mode
+cycles maps sequentially in one batched rollout, so the base loop +
+PufferEnv defaults handle parallelism without multi-process workers."""
+
 from typing import ClassVar
 
 from pufferlib.ocean.benchmark.evaluators.base import Evaluator
@@ -26,15 +19,6 @@ def env_overrides(self) -> dict:
         env.update(self.config.get("env", {}))
         return env
 
-    # vec_overrides + _initial_reset use the base-class defaults: PufferEnv
-    # backend with num_envs=1 and a sync reset. Drive's C side already
-    # allocates `min(ceil(num_agents/max_per_env), num_eval_scenarios)`
-    # internal envs and steps them in one batched kernel call, so we get
-    # full per-map parallelism without paying multi-process fork/IPC cost.
-    # Override [eval.<name>.vec] in the ini if you genuinely need workers.
-
-    # -- Loop hooks --
-
     def _maybe_reset_lstm(self, state, steps, args):
         # Reset between scenarios — gigaflow's auto-resample fires at the
         # end of scenario_length, so steps % scenario_length == 0 is the
@@ -50,149 +34,16 @@ def _should_stop(self, args, infos_collected, steps) -> bool:
         target = int(self.config.get("eval", {}).get("num_scenarios", 1))
         return len(infos_collected) >= target
 
-    # -- Render --
-
-    def _render_pass(self, vecenv, policy, args) -> list:
-        """One rollout per view, all writing mp4s to a single dir.
-
-        Builds a fresh single-worker env per view (C-side ffmpeg-per-env
-        wiring assumes one bin at a time per process). Render budget and
-        starting position are independent of the metric pass:
-
-        - eval.render_num_scenarios — how many scenarios to render. Defaults
-          to min(eval.num_scenarios, 3). Always takes precedence over
-          num_scenarios so renders stay cheap.
-        - starting_map — randomized per render epoch so successive epochs
-          show different scenarios from the dir, not the same first-N
-          alphabetically. Set explicitly in env.* to pin.
-        """
-        import importlib
+    def _render_env_overrides(self, args) -> dict:
+        # Random starting_map per render epoch — every epoch shows a
+        # different bin from the dir rather than the same alphabetical
+        # first-N. Pin by setting env.starting_map explicitly in the
+        # [eval.<name>] section.
         import random
-        import pufferlib
-
-        backend = args.get("render_backend", "egl")
-        if backend != "egl":
-            return []
-
-        env_name = args["env_name"]
-        out_dir = Path(args.get("render_results_dir") or args.get("eval_results_dir") or ".") / "mp4"
-        out_dir.mkdir(parents=True, exist_ok=True)
-
-        package = args.get("package", "ocean")
-        module_name = "pufferlib.ocean" if package == "ocean" else f"pufferlib.environments.{package}"
-        env_module = importlib.import_module(module_name)
-        make_env = env_module.env_creator(env_name)
-
-        render_env_kwargs = dict(args["env"])
-        render_env_kwargs["render_mode"] = "headless"
-
-        # Random starting map per render epoch — every epoch shows a
-        # different bin from the directory rather than the first N
-        # alphabetically. The user can pin by setting env.starting_map
-        # explicitly in the [eval.<name>] section.
+        out = super()._render_env_overrides(args)
         if "starting_map" not in self.config.get("env", {}):
-            num_maps = int(render_env_kwargs.get("num_maps", 1))
+            num_maps = int(out.get("num_maps", 1))
             if num_maps > 1:
-                render_env_kwargs["starting_map"] = random.randint(0, num_maps - 1)
-
-        # Stamp the training step into the filename so successive epochs
-        # produce distinct mp4s (Town01.xodr_step25100000_bev.mp4) instead
-        # of overwriting in place. wandb then shows one entry per epoch
-        # in the render carousel — useful for watching policy evolve over
-        # training. global_step falls back to 0 for ad-hoc CLI runs.
-        step_suffix = f"_step{int(args.get('global_step') or 0)}"
-
-        all_paths = []
-        for view in self.render_views:
-            view_idx = _VIEW_NAME_TO_IDX.get(view, 0)
-            view_suffix = step_suffix + ("" if view == "sim_state" else f"_{view}")
-
-            # PufferEnv backend treats the creator as a single callable and
-            # passes env_args/env_kwargs to it directly (not as per-env lists).
-            # The Multiprocessing/Serial backends expect lists; we don't use
-            # those here because EGL render assumes one ffmpeg pipe per env.
-            vec = pufferlib.vector.make(
-                make_env,
-                env_args=[],
-                env_kwargs=render_env_kwargs,
-                backend="PufferEnv",
-                num_envs=1,
-            )
-            target = vec if not hasattr(vec, "envs") else vec.envs[0]
-            internal = getattr(target, "num_envs", 1)
-            for e in range(internal):
-                target.set_video_suffix(view_suffix, env_idx=e)
-
-            paths = self._render_view(vec, target, policy, args, view_idx, out_dir)
-            vec.close()
-            all_paths.extend(paths)
-        return all_paths
-
-    def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: Path) -> list:
-        import numpy as np
-        import torch
-
-        import pufferlib
-
-        device = args["train"]["device"]
-        num_agents = vecenv.observation_space.shape[0]
-        # Render budget defaults to min(num_scenarios, 3) if not set explicitly.
-        # Renders are expensive (mp4 encode + wandb upload) so we don't want
-        # them at metric-pass scale.
-        eval_cfg = self.config.get("eval", {})
-        metric_count = int(eval_cfg.get("num_scenarios", 1))
-        num_scenarios = int(eval_cfg.get("render_num_scenarios", min(metric_count, 3)))
-        # Render-clip length: independent of scenario_length (which is the
-        # metric-pass length). At 30 fps, 300 steps = ~10s mp4. Per-step EGL
-        # render is the bottleneck (~3 fps wall-clock at 1080p), so keeping
-        # this small directly bounds the render-pass runtime.
-        max_steps = int(args.get("eval", {}).get("render_max_steps", 300))
-
-        saved_cwd = os.getcwd()
-        os.chdir(out_dir)
-        # Filename pattern for this pass: each scenario writes
-        # `{scenario_id}{view_suffix}.mp4`, and view_suffix embeds the
-        # global_step stamp. Globbing by that stamp picks up only this-pass
-        # mp4s and ignores accumulated files from prior epochs that share
-        # the dir. _render_pass set_video_suffix'd each env before calling
-        # us; recomputing the same step stamp here keeps the two aligned.
-        step_glob = f"*_step{int(args.get('global_step') or 0)}*.mp4"
-        try:
-            state = self._init_lstm_state(num_agents, policy, device, args)
-            scenarios_processed = 0
-            while scenarios_processed < num_scenarios:
-                ob, _ = vecenv.reset()
-                scenarios = vecenv.get_state()
-                num_in_batch = len(scenarios)
-                remaining = num_scenarios - scenarios_processed - num_in_batch
-                target_env.batch_size_eval = max(1, remaining)
-                if state:
-                    state["lstm_h"].zero_()
-                    state["lstm_c"].zero_()
-                for _ in range(max_steps):
-                    with torch.no_grad():
-                        ob_t = torch.as_tensor(ob).to(device)
-                        logits, _ = policy.forward_eval(ob_t, state)
-                        action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True)
-                    action = action.cpu().numpy().reshape(vecenv.action_space.shape)
-                    if isinstance(logits, torch.distributions.Normal):
-                        action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high)
-                    ob, _, _, _, _ = vecenv.step(action)
-                    for e in range(num_in_batch):
-                        target_env.render(env_idx=e, view_mode=view_idx)
-                for e in range(num_in_batch):
-                    target_env.close_client(env_idx=e)
-                scenarios_processed += num_in_batch
-        finally:
-            os.chdir(saved_cwd)
-
-        return sorted(out_dir.glob(step_glob))
-
-
-_VIEW_NAME_TO_IDX = {
-    "sim_state": 0,
-    "bev": 1,
-    "topdown_sim": 2,
-    "bev_all": 3,
-}
+                out["starting_map"] = random.randint(0, num_maps - 1)
+        return out

From 7253022eeba69a872ed67c97bfb539a1d4d91fe3 Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sun, 10 May 2026 02:12:40 -0400
Subject: [PATCH 26/26] [WIP] eval: render output into per-evaluator subdir

Every evaluator runs at the same global_step, so the previous shared
out_dir + step glob made each evaluator's _render_view re-collect every
earlier evaluator's mp4s into result.frames. wandb then logged
validation_gigaflow's videos under behaviors_*/render too.

Per-evaluator subdir (`mp4/<name>/`) keeps each evaluator's render
output isolated.
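As a sketch (evaluator and file names illustrative), two evaluators
rendering at the same step now write to disjoint dirs:

    mp4/validation_gigaflow/Town01.xodr_step25100000_bev.mp4
    mp4/behaviors_foo/scenario_step25100000.mp4

and each _render_view's glob, roughly

    out_dir = results_dir / "mp4" / self.name
    sorted(out_dir.glob(f"*{step_suffix}*.mp4"))

only ever sees its own evaluator's files.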
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 pufferlib/ocean/benchmark/evaluators/base.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pufferlib/ocean/benchmark/evaluators/base.py b/pufferlib/ocean/benchmark/evaluators/base.py
index 999102ca6..edf839d7e 100644
--- a/pufferlib/ocean/benchmark/evaluators/base.py
+++ b/pufferlib/ocean/benchmark/evaluators/base.py
@@ -175,7 +175,11 @@ def _render_pass(self, vecenv, policy, args) -> list:
 
         import pufferlib
 
-        out_dir = Path(args.get("render_results_dir") or args.get("eval_results_dir") or ".") / "mp4"
+        # Per-evaluator subdir so each evaluator's mp4s don't get re-globbed
+        # by the next evaluator's _render_view (every evaluator runs at the
+        # same global_step, so a shared dir + step glob would collect every
+        # earlier evaluator's mp4s into this one's result.frames).
+        out_dir = Path(args.get("render_results_dir") or args.get("eval_results_dir") or ".") / "mp4" / self.name
         out_dir.mkdir(parents=True, exist_ok=True)
 
         package = args.get("package", "ocean")
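A minimal sketch of the resulting subclass contract (class name and knob
value are illustrative, not part of the patches): a new evaluator that
wants videos only shapes its render env; the render loop, the step glob,
and the mp4/<name>/ isolation are all inherited from Evaluator.

    from pufferlib.ocean.benchmark.evaluators.base import Evaluator

    class CheapRenderEvaluator(Evaluator):
        def _render_env_overrides(self, args) -> dict:
            # Base default: metric-pass env kwargs + render_mode=headless.
            out = super()._render_env_overrides(args)
            # e.g. pin to a single bin so render clips stay cheap.
            out["num_maps"] = 1
            return out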