From 32738fb5c2bfca6ede8d9e8c3e298a47262dfd6e Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sat, 9 May 2026 18:27:45 -0400
Subject: [PATCH 01/26] [WIP] eval: add unified Evaluator + EvalManager (no functional integration yet)

Adds the new evaluator framework as pure additions:

pufferlib/ocean/benchmark/evaluators/
    base.py            Evaluator + EvalResult dataclass
    multi_scenario.py  MultiScenarioEvaluator (replaces eval_multi_scenarios)
    human_replay.py    HumanReplayEvaluator (replay + control_sdc_only loop)
    behavior_class.py  BehaviorClassEvaluator (per-class nuPlan suite)
    wosac.py           Thin wrapper around the existing WOSACEvaluator

pufferlib/ocean/benchmark/manager.py
    EvalManager: section discovery, inheritance chain, clean-eval macro,
    dotted-key flattening, inline + subprocess dispatch, wandb logging.

Plus tests/test_eval_manager.py for the config-merge logic.

See docs/eval_unification.md for the full design rationale.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .gitignore                                     |   2 +
 .../ocean/benchmark/evaluators/__init__.py     |  33 +++
 pufferlib/ocean/benchmark/evaluators/base.py   |  76 +++++
 .../benchmark/evaluators/behavior_class.py     |  57 ++++
 .../benchmark/evaluators/human_replay.py       |  79 +++++
 .../benchmark/evaluators/multi_scenario.py     | 207 ++++++++++++++
 pufferlib/ocean/benchmark/evaluators/wosac.py  |  42 +++
 pufferlib/ocean/benchmark/manager.py           | 269 ++++++++++++++++++
 tests/test_eval_manager.py                     | 122 ++++++++
 9 files changed, 887 insertions(+)
 create mode 100644 pufferlib/ocean/benchmark/evaluators/__init__.py
 create mode 100644 pufferlib/ocean/benchmark/evaluators/base.py
 create mode 100644 pufferlib/ocean/benchmark/evaluators/behavior_class.py
 create mode 100644 pufferlib/ocean/benchmark/evaluators/human_replay.py
 create mode 100644 pufferlib/ocean/benchmark/evaluators/multi_scenario.py
 create mode 100644 pufferlib/ocean/benchmark/evaluators/wosac.py
 create mode 100644 pufferlib/ocean/benchmark/manager.py
 create mode 100644 tests/test_eval_manager.py

diff --git a/.gitignore b/.gitignore
index 4ca0ece3c..782cfdf36 100644
--- a/.gitignore
+++ b/.gitignore
@@ -148,6 +148,8 @@ dmypy.json
 checkpoints/
 experiments/
 benchmark*/
+!pufferlib/ocean/benchmark/
+!pufferlib/ocean/benchmark/**
 wandb/
 .neptune/
 raylib*/
diff --git a/pufferlib/ocean/benchmark/evaluators/__init__.py b/pufferlib/ocean/benchmark/evaluators/__init__.py
new file mode 100644
index 000000000..d7594bffc
--- /dev/null
+++ b/pufferlib/ocean/benchmark/evaluators/__init__.py
@@ -0,0 +1,33 @@
+"""Unified evaluator framework for PufferDrive.
+
+Each Evaluator subclass owns one rollout pattern. The EvalManager (parent
+package) discovers evaluators from `[eval.<name>]` sections in drive.ini
+and dispatches them inline (during training) or as subprocesses.
+
+See docs/eval_unification.md for the full design rationale.
+"""
+
+from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator
+from pufferlib.ocean.benchmark.evaluators.behavior_class import BehaviorClassEvaluator
+from pufferlib.ocean.benchmark.evaluators.human_replay import HumanReplayEvaluator
+from pufferlib.ocean.benchmark.evaluators.multi_scenario import MultiScenarioEvaluator
+from pufferlib.ocean.benchmark.evaluators.wosac import WOSACEvaluator
+
+# Type registry for [eval.<name>].type → class lookup. Manager uses this
+# to instantiate the right subclass per config section.
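+# For example, a drive.ini section [eval.validation_gigaflow] with
+# type = "multi_scenario" resolves here to MultiScenarioEvaluator; the
+# manager then constructs it with name="validation_gigaflow".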
+EVALUATOR_REGISTRY = {
+    "multi_scenario": MultiScenarioEvaluator,
+    "behavior_class": BehaviorClassEvaluator,
+    "human_replay": HumanReplayEvaluator,
+    "wosac": WOSACEvaluator,
+}
+
+__all__ = [
+    "EVALUATOR_REGISTRY",
+    "EvalResult",
+    "Evaluator",
+    "MultiScenarioEvaluator",
+    "BehaviorClassEvaluator",
+    "HumanReplayEvaluator",
+    "WOSACEvaluator",
+]
diff --git a/pufferlib/ocean/benchmark/evaluators/base.py b/pufferlib/ocean/benchmark/evaluators/base.py
new file mode 100644
index 000000000..066c1919d
--- /dev/null
+++ b/pufferlib/ocean/benchmark/evaluators/base.py
@@ -0,0 +1,76 @@
+"""Evaluator base class + EvalResult dataclass."""
+
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import ClassVar
+
+
+@dataclass
+class EvalResult:
+    metrics: dict
+    frames: list = field(default_factory=list)
+
+
+class Evaluator:
+    """Base class for all evaluators.
+
+    Subclasses set `type_name` (the value used in `[eval.<name>].type`) and
+    implement `rollout()`. Optionally override `env_overrides()`,
+    `vec_overrides()`, and `aggregate()`.
+    """
+
+    type_name: ClassVar[str] = ""
+
+    def __init__(self, name: str, config: dict, train_config: dict):
+        # `name` = the [eval.<name>] section name. Used as the wandb prefix.
+        self.name = name
+        # `config` = merged per-evaluator config (after inheritance + clean
+        # macro expansion). Has nested `env`, `vec`, plus flat scalar knobs.
+        self.config = config
+        # `train_config` = the full training config from drive.ini, used as
+        # the base layer that `config` overrides on top of.
+        self.train_config = train_config
+
+        # Common scalars pulled out for ergonomics.
+        self.enabled: bool = bool(config.get("enabled", True))
+        self.interval: int = int(config.get("interval", 0))
+        self.mode: str = config.get("mode", "inline")
+        self.render: bool = bool(config.get("render", False))
+        self.render_views: list = list(config.get("render_views", ["sim_state"]))
+        self.clean: bool = bool(config.get("clean", True))
+
+    def env_overrides(self) -> dict:
+        """Per-evaluator [env] overrides. Defaults to whatever the section
+        wrote under `env.*`. Subclasses can override to add baseline knobs."""
+        return dict(self.config.get("env", {}))
+
+    def vec_overrides(self) -> dict:
+        """Per-evaluator [vec] overrides. Default: serial single-worker —
+        the safe default for replay-style evals where each worker is a
+        single bin replay. Subclasses that want parallel throughput
+        (gigaflow validation) override this."""
+        base = {"backend": "PufferEnv", "num_envs": 1}
+        base.update(self.config.get("vec", {}))
+        return base
+
+    def rollout(self, vecenv, policy, args) -> EvalResult:
+        raise NotImplementedError
+
+    def aggregate(self, per_rollout: list) -> dict:
+        """Reduce a list of per-rollout dicts to a single metrics dict.
+
+        Default: numeric mean over keys present in any sub-dict. WOSAC
+        overrides for likelihood-style aggregation."""
+        import numpy as np
+
+        if not per_rollout:
+            return {}
+        keys = set()
+        for r in per_rollout:
+            keys.update(r.keys())
+        out = {}
+        for k in keys:
+            vals = [r[k] for r in per_rollout if k in r and isinstance(r[k], (int, float))]
+            if vals:
+                out[k] = float(np.mean(vals))
+        return out
diff --git a/pufferlib/ocean/benchmark/evaluators/behavior_class.py b/pufferlib/ocean/benchmark/evaluators/behavior_class.py
new file mode 100644
index 000000000..2d3a7f45c
--- /dev/null
+++ b/pufferlib/ocean/benchmark/evaluators/behavior_class.py
@@ -0,0 +1,57 @@
+"""BehaviorClassEvaluator — one nuPlan behavior category at a time.
+
+Runs a HumanReplayEvaluator-style rollout against a single map_dir, with
+optional fresh random sampling each pass when `num_scenarios` < total bins.
+"""
+
+import os
+import random
+import shutil
+import tempfile
+from typing import ClassVar
+
+from pufferlib.ocean.benchmark.evaluators.base import EvalResult
+from pufferlib.ocean.benchmark.evaluators.human_replay import HumanReplayEvaluator
+
+
+class BehaviorClassEvaluator(HumanReplayEvaluator):
+    type_name: ClassVar[str] = "behavior_class"
+
+    def __init__(self, name, config, train_config):
+        super().__init__(name, config, train_config)
+        self._sampled_dir = None  # tmp symlink dir created per pass
+
+    def env_overrides(self) -> dict:
+        # Reuse HumanReplay's defaults, then handle the random-sampling
+        # cap. If num_scenarios is smaller than total bins, build a tmp
+        # symlink dir with a fresh sample each pass and point map_dir there.
+        env = super().env_overrides()
+        map_dir = env.get("map_dir", "")
+        if not map_dir or not os.path.isdir(map_dir):
+            return env
+
+        num_scenarios = int(self.config.get("eval", {}).get("num_scenarios", 0))
+        all_bins = [f for f in os.listdir(map_dir) if f.endswith(".bin")]
+        if num_scenarios > 0 and num_scenarios < len(all_bins):
+            sampled = random.sample(all_bins, num_scenarios)
+            self._sampled_dir = tempfile.mkdtemp(prefix=f"{self.name}_")
+            for fname in sampled:
+                # Use an absolute target so the link stays valid even when
+                # map_dir is given relative to the launch directory.
+                os.symlink(os.path.abspath(os.path.join(map_dir, fname)), os.path.join(self._sampled_dir, fname))
+            env["map_dir"] = self._sampled_dir
+            env["num_agents"] = num_scenarios
+            env["num_maps"] = num_scenarios
+        else:
+            env["num_agents"] = len(all_bins)
+            env["num_maps"] = len(all_bins)
+        return env
+
+    def rollout(self, vecenv, policy, args) -> EvalResult:
+        result = super().rollout(vecenv, policy, args)
+        # Manager owns the cleanup window — defer rmtree until after vecenv.close
+        # so any open file descriptors on the symlinks are released first.
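+        # (Call order in EvalManager._run_one: rollout() → vecenv.close() in
+        # _run_inline's finally block → ev.cleanup(), so the sampled symlink
+        # dir outlives every open handle on it.)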
+ return result + + def cleanup(self): + if self._sampled_dir and os.path.isdir(self._sampled_dir): + shutil.rmtree(self._sampled_dir, ignore_errors=True) + self._sampled_dir = None diff --git a/pufferlib/ocean/benchmark/evaluators/human_replay.py b/pufferlib/ocean/benchmark/evaluators/human_replay.py new file mode 100644 index 000000000..28e19e22b --- /dev/null +++ b/pufferlib/ocean/benchmark/evaluators/human_replay.py @@ -0,0 +1,79 @@ +"""HumanReplayEvaluator — replay mode + control_sdc_only, one rollout per +bin in the map_dir, mean of per-episode info dicts.""" + +import os +from typing import ClassVar + +import numpy as np +import torch + +import pufferlib +from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator + + +class HumanReplayEvaluator(Evaluator): + type_name: ClassVar[str] = "human_replay" + + def env_overrides(self) -> dict: + env = { + "simulation_mode": "replay", + "control_mode": "control_sdc_only", + "init_mode": "create_all_valid", + "eval_mode": 1, + "termination_mode": 0, + "reward_randomization": False, + } + env.update(self.config.get("env", {})) + # num_agents = number of bins so each gets one episode slot + if "num_agents" not in env: + map_dir = env.get("map_dir", "") + if map_dir and os.path.isdir(map_dir): + env["num_agents"] = len([f for f in os.listdir(map_dir) if f.endswith(".bin")]) + env["num_maps"] = env["num_agents"] + return env + + def rollout(self, vecenv, policy, args) -> EvalResult: + device = args["train"]["device"] + scenario_length = int(args["env"]["scenario_length"]) + init_steps = int(args["env"].get("init_steps", 0)) + num_maps = int(args["env"]["num_maps"]) + num_agents = vecenv.observation_space.shape[0] + + # +1 step margin: env emits done on the step after scenario_length. + total_steps = (scenario_length - init_steps + 1) * num_maps + + obs, _ = vecenv.reset() + state = {} + if args["train"]["use_rnn"]: + state = dict( + lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), + lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), + ) + + all_infos = [] + for _ in range(total_steps): + with torch.no_grad(): + ob_t = torch.as_tensor(obs).to(device) + logits, _ = policy.forward_eval(ob_t, state) + action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) + action_np = action.cpu().numpy().reshape(vecenv.action_space.shape) + if isinstance(logits, torch.distributions.Normal): + action_np = np.clip(action_np, vecenv.action_space.low, vecenv.action_space.high) + obs, _, _, _, info_list = vecenv.step(action_np) + if info_list: + all_infos.extend(info_list) + # Stop once every bin has yielded one info to avoid double-counting + # on the second cycle through the dir. 
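+            # (Assumes each bin yields exactly one terminal info dict;
+            # env_overrides sizes num_agents to one episode slot per bin
+            # for exactly that reason.)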
+            if len(all_infos) >= num_maps:
+                break
+
+        if not all_infos:
+            return EvalResult(metrics={"num_scenarios_completed": 0})
+
+        metrics = {"num_scenarios_completed": float(len(all_infos))}
+        keys = set().union(*(d.keys() for d in all_infos))
+        for k in keys:
+            vals = [d[k] for d in all_infos if isinstance(d.get(k), (int, float))]
+            if vals:
+                metrics[k] = float(np.mean(vals))
+        return EvalResult(metrics=metrics, frames=[])
diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
new file mode 100644
index 000000000..365505bfd
--- /dev/null
+++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
@@ -0,0 +1,207 @@
+"""MultiScenarioEvaluator — distribute scenarios across workers, one rollout
+per scenario, mean per-scenario metrics."""
+
+import contextlib
+import os
+import time
+from pathlib import Path
+
+import numpy as np
+import torch
+import tqdm
+
+import pufferlib
+from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator
+
+
+class MultiScenarioEvaluator(Evaluator):
+    type_name = "multi_scenario"
+
+    def vec_overrides(self) -> dict:
+        # Multi-worker by default for throughput. Override via [eval.<name>.vec].
+        backend = self.train_config.get("vec", {}).get("backend", "PufferEnv")
+        num_envs = int(self.config.get("vec", {}).get("num_envs", 1))
+        return {"backend": backend, "num_envs": num_envs}
+
+    def env_overrides(self) -> dict:
+        # Sensible defaults for the gigaflow path; replay configs are expected
+        # to set the relevant knobs in [eval.<name>.env.*].
+        env = {
+            "eval_mode": 1,
+            "termination_mode": 0,
+            "reward_randomization": False,
+        }
+        env.update(self.config.get("env", {}))
+        return env
+
+    def rollout(self, vecenv, policy, args) -> EvalResult:
+        t0 = time.time()
+        num_scenarios = int(self.config.get("eval", {}).get("num_scenarios", 1))
+        scenario_length = int(args["env"].get("scenario_length", 91))
+        device = args["train"]["device"]
+        num_agents = vecenv.observation_space.shape[0]
+
+        global_infos = {}
+
+        # LSTM hidden state shared across the rollout; reset each scenario batch.
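+        # state stays {} for feed-forward policies; with use_rnn it carries
+        # lstm_h/lstm_c tensors of shape (num_agents, policy.hidden_size).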
+ state = {} + if args["train"]["use_rnn"]: + state = dict( + lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), + lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), + ) + + vecenv.async_reset(args.get("seed", 42)) + ob, _, _, _, infos, _, _ = vecenv.recv() + scenarios_processed = 0 + with tqdm.tqdm(total=num_scenarios, desc=f"[{self.name}] scenarios", disable=args.get("quiet", False)) as pbar: + while scenarios_processed < num_scenarios: + if args["train"]["use_rnn"]: + state["lstm_h"].zero_() + state["lstm_c"].zero_() + + for _ in range(scenario_length): + with torch.no_grad(): + ob_t = torch.as_tensor(ob).to(device) + logits, _ = policy.forward_eval(ob_t, state) + action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) + action = action.cpu().numpy().reshape(vecenv.action_space.shape) + if isinstance(logits, torch.distributions.Normal): + action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) + + ob, _, _, _, infos = vecenv.step(action) + + if infos and infos[0]: + for sub_env in infos: + for env_idx, summary in enumerate(sub_env): + map_name = summary["map_name"].split("/")[-1].split(".")[0] + summary["episode_id"] = env_idx + summary["map_name"] = map_name + scenarios_processed += 1 + pbar.update(1) + for k, v in summary.items(): + global_infos.setdefault(k, []).append(v) + + metrics = self._average(global_infos) + if not args.get("quiet", False): + print(f"[{self.name}] {scenarios_processed} scenarios in {time.time() - t0:.1f}s") + + frames = [] + if self.render: + frames = self._render_pass(vecenv, policy, args) + + return EvalResult(metrics=metrics, frames=frames) + + def _average(self, global_infos: dict) -> dict: + out = {} + import numbers + + for k, vs in global_infos.items(): + if k == "num_scenarios": + out[k] = float(np.sum(vs)) + elif vs and isinstance(vs[0], numbers.Number): + out[k] = float(np.mean(vs)) + return out + + def _render_pass(self, vecenv, policy, args) -> list: + """One rollout per view, all writing mp4s to a single dir. + + Re-uses the same vecenv if it's a single-worker setup; otherwise + delegates to a serial render env built fresh per view. + """ + import importlib + + env_name = args["env_name"] + backend = args.get("render_backend", "egl") + if backend != "egl": + return [] + + out_dir = Path(args.get("render_results_dir") or args.get("eval_results_dir") or ".") / "mp4" + out_dir.mkdir(parents=True, exist_ok=True) + + # Render with a fresh single-worker env so frame capture is sequential + # and starting_map_counter starts at 0. Multi-worker render doesn't + # match the C-side ffmpeg-per-env wiring cleanly. 
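+        # e.g. package="ocean" → importlib.import_module("pufferlib.ocean");
+        # any other package resolves to "pufferlib.environments.<package>".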
+ package = args.get("package", "ocean") + module_name = "pufferlib.ocean" if package == "ocean" else f"pufferlib.environments.{package}" + env_module = importlib.import_module(module_name) + make_env = env_module.env_creator(env_name) + + render_env_kwargs = dict(args["env"]) + render_env_kwargs["render_mode"] = "headless" + + all_paths = [] + for view in self.render_views: + view_idx = _VIEW_NAME_TO_IDX.get(view, 0) + view_suffix = "" if view == "sim_state" else f"_{view}" + + vec = pufferlib.vector.make( + [make_env], + env_args=[[]], + env_kwargs=[render_env_kwargs], + backend="PufferEnv", + num_envs=1, + num_workers=1, + batch_size=1, + ) + target = vec if not hasattr(vec, "envs") else vec.envs[0] + internal = getattr(target, "num_envs", 1) + for e in range(internal): + target.set_video_suffix(view_suffix, env_idx=e) + + paths = self._render_view(vec, target, policy, args, view_idx, out_dir) + vec.close() + all_paths.extend(paths) + return all_paths + + def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: Path) -> list: + device = args["train"]["device"] + num_agents = vecenv.observation_space.shape[0] + num_scenarios = int(self.config.get("eval", {}).get("num_scenarios", 1)) + max_steps = args.get("render_max_steps") or int(args["env"].get("scenario_length", 91)) + + saved_cwd = os.getcwd() + os.chdir(out_dir) + try: + state = {} + if args["train"]["use_rnn"]: + state = dict( + lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), + lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), + ) + scenarios_processed = 0 + while scenarios_processed < num_scenarios: + ob, _ = vecenv.reset() + scenarios = vecenv.get_state() + num_in_batch = len(scenarios) + remaining = num_scenarios - scenarios_processed - num_in_batch + target_env.batch_size_eval = max(1, remaining) + if args["train"]["use_rnn"]: + state["lstm_h"].zero_() + state["lstm_c"].zero_() + for _ in range(max_steps): + with torch.no_grad(): + ob_t = torch.as_tensor(ob).to(device) + logits, _ = policy.forward_eval(ob_t, state) + action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) + action = action.cpu().numpy().reshape(vecenv.action_space.shape) + if isinstance(logits, torch.distributions.Normal): + action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) + ob, _, _, _, _ = vecenv.step(action) + for e in range(num_in_batch): + target_env.render(env_idx=e, view_mode=view_idx) + for e in range(num_in_batch): + target_env.close_client(env_idx=e) + scenarios_processed += num_in_batch + finally: + os.chdir(saved_cwd) + + return sorted(p for p in out_dir.glob("*.mp4")) + + +_VIEW_NAME_TO_IDX = { + "sim_state": 0, + "bev": 1, + "topdown_sim": 2, + "bev_all": 3, +} diff --git a/pufferlib/ocean/benchmark/evaluators/wosac.py b/pufferlib/ocean/benchmark/evaluators/wosac.py new file mode 100644 index 000000000..8733c8a2a --- /dev/null +++ b/pufferlib/ocean/benchmark/evaluators/wosac.py @@ -0,0 +1,42 @@ +"""WOSACEvaluator — Waymo Open Sim Agents Challenge realism eval. + +Wraps the existing WOSACEvaluator class in benchmark/evaluator.py — that +file owns the realism math (per-feature likelihood under learned +estimators) and the per-scene multi-rollout structure. This adapter +fits it into the unified Evaluator interface. 
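+
+Enable it via the [eval.wosac] section in drive.ini (type = "wosac"); it
+ships disabled there and is typically dispatched in subprocess mode.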
+""" + +from typing import ClassVar + +from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator + + +class WOSACEvaluator(Evaluator): + type_name: ClassVar[str] = "wosac" + + def env_overrides(self) -> dict: + env = { + "control_mode": "control_wosac", + "init_mode": "create_all_valid", + "eval_mode": 1, + "termination_mode": 0, + "reward_randomization": False, + } + env.update(self.config.get("env", {})) + return env + + def rollout(self, vecenv, policy, args) -> EvalResult: + # Inner class pulls pandas/matplotlib — keep the import inside the + # rollout so the wrapper class can be imported in environments + # that don't have those (e.g. unit-test smoke envs). + from pufferlib.ocean.benchmark.evaluator import WOSACEvaluator as _WOSACInner + + inner = _WOSACInner(args) + df = inner.evaluate(args, vecenv, policy) + # df has one row per scene; aggregate to a single dict. + results = df.mean(numeric_only=True).to_dict() + results["total_num_agents"] = float(df["num_agents_per_scene"].sum()) + results["total_unique_scenarios"] = float(df.index.unique().shape[0]) + results["realism_meta_score_std"] = float(df["realism_meta_score"].std()) + results = {k: (float(v) if hasattr(v, "item") else v) for k, v in results.items()} + return EvalResult(metrics=results, frames=[]) diff --git a/pufferlib/ocean/benchmark/manager.py b/pufferlib/ocean/benchmark/manager.py new file mode 100644 index 000000000..86dedaacc --- /dev/null +++ b/pufferlib/ocean/benchmark/manager.py @@ -0,0 +1,269 @@ +"""EvalManager — discovers `[eval.]` sections, instantiates Evaluators, +dispatches them inline or as subprocesses, logs results. + +Config schema (see docs/eval_unification.md): + + [eval.] + type = "" + enabled = true|false + interval = + mode = "inline" | "subprocess" + inherits = "" # optional, recursive merge + clean = true|false + render = true|false + render_views = ["sim_state", ...] + env. = # any [env] override + eval. = # evaluator-specific knobs + vec. = # any [vec] override + +Sections without a `type` field are templates (only usable via `inherits`). +""" + +import copy +import importlib +import json +import os +import subprocess +import sys +import time +from pathlib import Path + +import pufferlib + +from pufferlib.ocean.benchmark.evaluators import EVALUATOR_REGISTRY, EvalResult, Evaluator + +# clean_eval macro — env knobs to zero/enforce. Per-section explicit values +# win over the macro (see _build_section_config). +CLEAN_EVAL_OVERRIDES = { + "lane_segment_dropout": 0.0, + "boundary_segment_dropout": 0.0, + "partner_blindness_prob": 0.0, + "phantom_braking_prob": 0.0, + "phantom_braking_trigger_prob": 0.0, + "traffic_light_behavior": 1, +} + + +class EvalManager: + def __init__(self, evaluators: list, train_config: dict): + self.evaluators = evaluators + self.train_config = train_config + + @classmethod + def from_config(cls, train_config: dict) -> "EvalManager": + sections = _discover_eval_sections(train_config) + evaluators = [] + for name, raw in sections.items(): + cfg = _build_section_config(name, raw, sections) + type_name = cfg.get("type") + if type_name is None: + # Template section — referenced via inherits but not instantiated. + continue + cls_for_type = EVALUATOR_REGISTRY.get(type_name) + if cls_for_type is None: + raise ValueError( + f"[eval.{name}] type='{type_name}' is not registered. 
" + f"Known types: {sorted(EVALUATOR_REGISTRY.keys())}" + ) + evaluators.append(cls_for_type(name=name, config=cfg, train_config=train_config)) + return cls(evaluators=evaluators, train_config=train_config) + + def maybe_run(self, epoch: int, policy, env_name: str, logger=None, global_step=None) -> dict: + """Called from the training loop. Runs every enabled evaluator + whose `interval` divides `epoch`. Returns a dict of {eval_name → metrics}.""" + results = {} + for ev in self.evaluators: + if not ev.enabled: + continue + if ev.interval <= 0: + continue + if epoch % ev.interval != 0: + continue + res = self._run_one(ev, policy=policy, env_name=env_name, logger=logger, global_step=global_step) + results[ev.name] = res + return results + + def run_one_by_name(self, name: str, policy, env_name: str, logger=None, global_step=None) -> EvalResult: + """Run a single named evaluator regardless of interval. Used for + the subprocess CLI entry and for standalone `puffer eval --evaluator `.""" + for ev in self.evaluators: + if ev.name == name: + return self._run_one(ev, policy=policy, env_name=env_name, logger=logger, global_step=global_step) + raise KeyError(f"No evaluator named '{name}'. Known: {[e.name for e in self.evaluators]}") + + def _run_one(self, ev: Evaluator, policy, env_name: str, logger, global_step) -> EvalResult: + if ev.mode == "subprocess": + res = self._run_subprocess(ev, env_name=env_name, global_step=global_step) + else: + res = self._run_inline(ev, policy=policy, env_name=env_name, global_step=global_step) + if logger is not None: + self._log(ev, res, logger=logger, global_step=global_step) + if hasattr(ev, "cleanup"): + ev.cleanup() + return res + + def _run_inline(self, ev: Evaluator, policy, env_name: str, global_step) -> EvalResult: + args = self._build_eval_args(ev, env_name=env_name, global_step=global_step) + + package = args.get("package", "ocean") + module_name = "pufferlib.ocean" if package == "ocean" else f"pufferlib.environments.{package}" + env_module = importlib.import_module(module_name) + make_env = env_module.env_creator(env_name) + + vec_kwargs = ev.vec_overrides() + num_envs = int(vec_kwargs.get("num_envs", 1)) + env_kwargs_list = [args["env"] for _ in range(num_envs)] + env_creators = [make_env] * num_envs + env_args_list = [[]] * num_envs + + vec_call_kwargs = dict(vec_kwargs) + vec_call_kwargs.setdefault("num_workers", num_envs) + vec_call_kwargs.setdefault("batch_size", num_envs) + + vecenv = pufferlib.vector.make( + env_creators, env_args=env_args_list, env_kwargs=env_kwargs_list, **vec_call_kwargs + ) + try: + res = ev.rollout(vecenv, policy, args) + finally: + vecenv.close() + return res + + def _run_subprocess(self, ev: Evaluator, env_name: str, global_step) -> EvalResult: + out_path = Path(self.train_config.get("data_dir", ".")) / "eval_subprocess_out" / f"{ev.name}.json" + out_path.parent.mkdir(parents=True, exist_ok=True) + cfg_path = out_path.with_suffix(".cfg.json") + with open(cfg_path, "w") as f: + json.dump({"name": ev.name, "global_step": global_step}, f) + + cmd = [ + sys.executable, + "-m", + "pufferlib.pufferl", + "eval", + env_name, + "--evaluator", + ev.name, + "--out", + str(out_path), + ] + # Subprocess inherits the same checkpoint via train_config.load_model_path. 
+        if self.train_config.get("load_model_path"):
+            cmd += ["--load-model-path", self.train_config["load_model_path"]]
+        subprocess.run(cmd, check=True)
+        with open(out_path) as f:
+            payload = json.load(f)
+        return EvalResult(metrics=payload.get("metrics", {}), frames=payload.get("frames", []))
+
+    def _build_eval_args(self, ev: Evaluator, env_name: str, global_step) -> dict:
+        args = copy.deepcopy(self.train_config)
+        args["env"].update(ev.env_overrides())
+        args.setdefault("vec", {})
+        args["vec"].update(ev.vec_overrides())
+        args["env_name"] = env_name
+        args["global_step"] = global_step
+        args["seed"] = int(self.train_config.get("train", {}).get("seed", 42)) or 42
+        # Pass through evaluator-private fields that subclasses look up on args.
+        ev_eval = ev.config.get("eval", {})
+        if ev_eval:
+            args.setdefault("eval", {})
+            args["eval"].update(ev_eval)
+        return args
+
+    def _log(self, ev: Evaluator, result: EvalResult, logger, global_step):
+        if not result.metrics and not result.frames:
+            return
+        log_dict = {f"{ev.name}/{k}": float(v) for k, v in result.metrics.items() if isinstance(v, (int, float))}
+        if hasattr(logger, "local_writer") and logger.local_writer and global_step is not None:
+            for k, v in log_dict.items():
+                logger.local_writer.add_scalar(k, v, global_step)
+        if hasattr(logger, "log") and log_dict:
+            if global_step is not None:
+                logger.log(log_dict, global_step)
+            else:
+                logger.log(log_dict)
+        if result.frames and hasattr(logger, "log"):
+            try:
+                import wandb
+
+                videos = [
+                    wandb.Video(str(p), fps=30, format="mp4", caption=Path(p).stem)
+                    for p in result.frames
+                    if str(p).endswith(".mp4")
+                ]
+                if videos:
+                    payload = {f"{ev.name}/render": videos if len(videos) > 1 else videos[0]}
+                    if global_step is not None:
+                        logger.log(payload, global_step)
+                    else:
+                        logger.log(payload)
+            except ImportError:
+                pass
+
+
+def _discover_eval_sections(args: dict) -> dict:
+    """Pull `[eval.<name>]` sections out of the parsed config.
+
+    `load_config` flattens dotted section names into a nested dict. So
+    `[eval.foo]` becomes `args["eval"]["foo"]`. We collect every direct
+    child of `args["eval"]` that's itself a dict and treat it as a section."""
+    eval_root = args.get("eval", {})
+    if not isinstance(eval_root, dict):
+        return {}
+    sections = {}
+    for name, body in eval_root.items():
+        if isinstance(body, dict):
+            sections[name] = body
+    return sections
+
+
+def _build_section_config(name: str, raw: dict, all_sections: dict) -> dict:
+    """Resolve `inherits` chain + `clean` macro + dotted-key flattening."""
+    chain = []
+    current_name = name
+    current_raw = raw
+    visited = set()
+    while True:
+        if current_name in visited:
+            raise ValueError(f"Cyclic 'inherits' chain involving [eval.{current_name}]")
+        visited.add(current_name)
+        chain.append(current_raw)
+        parent_name = current_raw.get("inherits")
+        if parent_name is None:
+            break
+        if parent_name not in all_sections:
+            raise ValueError(f"[eval.{current_name}].inherits='{parent_name}' is not a known section")
+        current_name = parent_name
+        current_raw = all_sections[parent_name]
+
+    merged = {}
+    for level in reversed(chain):
+        _deep_merge(merged, _expand_dotted(level))
+
+    if merged.get("clean", True):
+        env_section = merged.setdefault("env", {})
+        for k, v in CLEAN_EVAL_OVERRIDES.items():
+            env_section.setdefault(k, v)
+
+    return merged
+
+
+def _expand_dotted(raw: dict) -> dict:
+    """`{"env.simulation_mode": "replay"}` → `{"env": {"simulation_mode": "replay"}}`."""
+    out = {}
+    for k, v in raw.items():
+        if "." in k:
+            head, _, tail = k.partition(".")
+            sub = out.setdefault(head, {})
+            sub[tail] = v
+        else:
+            out[k] = v
+    return out
+
+
+def _deep_merge(dst: dict, src: dict):
+    for k, v in src.items():
+        if isinstance(v, dict) and isinstance(dst.get(k), dict):
+            _deep_merge(dst[k], v)
+        else:
+            dst[k] = v
diff --git a/tests/test_eval_manager.py b/tests/test_eval_manager.py
new file mode 100644
index 000000000..064463cc7
--- /dev/null
+++ b/tests/test_eval_manager.py
@@ -0,0 +1,122 @@
+"""Smoke tests for EvalManager config parsing.
+
+Doesn't load the full pufferl.py module (which pulls heavy training deps).
+Just verifies the inheritance + clean macro + dotted-key expansion logic
+behaves as the design doc says.
+"""
+
+import os
+import sys
+
+import pytest
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from pufferlib.ocean.benchmark.manager import (
+    CLEAN_EVAL_OVERRIDES,
+    EvalManager,
+    _build_section_config,
+    _expand_dotted,
+)
+
+
+def test_dotted_expand():
+    raw = {"env.simulation_mode": "replay", "interval": 25}
+    out = _expand_dotted(raw)
+    assert out == {"env": {"simulation_mode": "replay"}, "interval": 25}
+
+
+def test_inheritance_chain():
+    sections = {
+        "behaviors_defaults": {
+            "type": "behavior_class",
+            "interval": 250,
+            "env.simulation_mode": "replay",
+            "env.scenario_length": 201,
+        },
+        "behaviors_hard_stop": {
+            "inherits": "behaviors_defaults",
+            "env.map_dir": "/tmp/hard_stop",
+        },
+    }
+    cfg = _build_section_config("behaviors_hard_stop", sections["behaviors_hard_stop"], sections)
+    assert cfg["type"] == "behavior_class"
+    assert cfg["interval"] == 250
+    assert cfg["env"]["simulation_mode"] == "replay"
+    assert cfg["env"]["scenario_length"] == 201
+    assert cfg["env"]["map_dir"] == "/tmp/hard_stop"
+
+
+def test_inheritance_child_wins():
+    sections = {
+        "parent": {"interval": 250, "env.scenario_length": 201},
+        "child": {"inherits": "parent", "interval": 100, "env.scenario_length": 91},
+    }
+    cfg = _build_section_config("child", sections["child"], sections)
+    assert cfg["interval"] == 100
+    assert cfg["env"]["scenario_length"] == 91
+
+
+def test_inheritance_cycle_detected():
+    sections = {
+        "a": {"inherits": "b"},
+        "b": {"inherits": "a"},
+    }
+    with pytest.raises(ValueError, match="Cyclic"):
+        _build_section_config("a", sections["a"], sections)
+
+
+def test_inheritance_unknown_parent():
+    sections = {
+        "child": {"inherits": "nonexistent"},
+    }
+    with pytest.raises(ValueError, match="not a known section"):
+        _build_section_config("child", sections["child"], sections)
+
+
+def test_clean_macro_applied_by_default():
+    sections = {"foo": {"type": "multi_scenario"}}
+    cfg = _build_section_config("foo", sections["foo"], sections)
+    for k, v in CLEAN_EVAL_OVERRIDES.items():
+        assert cfg["env"][k] == v
+
+
+def test_clean_macro_disabled_when_clean_false():
+    sections = {"foo": {"type": "multi_scenario", "clean": False}}
+    cfg = _build_section_config("foo", sections["foo"], sections)
+    for k in CLEAN_EVAL_OVERRIDES:
+        assert k not in cfg.get("env", {})
+
+
+def test_clean_macro_loses_to_explicit_override():
+    sections = {
+        "foo": {
+            "type": "multi_scenario",
+            "env.lane_segment_dropout": 0.5,  # explicit > macro default of 0.0
+        }
+    }
+    cfg = _build_section_config("foo", sections["foo"], sections)
+    assert cfg["env"]["lane_segment_dropout"] == 0.5
+
+
+def test_manager_from_config_skips_template_sections():
+    train_config = {
+        "eval": {
+            "behaviors_defaults": {"interval": 250, "env.scenario_length": 201},
+            "behaviors_hard_stop": {
+                "type": "behavior_class",
+                "inherits": "behaviors_defaults",
+                "env.map_dir": "/tmp/hard_stop",
+            },
+        },
+    }
+    mgr = EvalManager.from_config(train_config)
+    names = [e.name for e in mgr.evaluators]
+    assert "behaviors_hard_stop" in names
+    assert "behaviors_defaults" not in names  # template, no `type` field
+
+
+def test_manager_unknown_type_raises():
+    train_config = {"eval": {"foo": {"type": "totally_made_up"}}}
+    with pytest.raises(ValueError, match="not registered"):
+        EvalManager.from_config(train_config)

From 7218f5c11e3fe700f9022502e7d2daecaa973224 Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sat, 9 May 2026 18:30:09 -0400
Subject: [PATCH 02/26] [WIP] eval: replace [eval] with [eval.<name>] sections in drive.ini
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Auto-discovered evaluator sections, schema A (dotted keys). Each
behavior class is its own section inheriting from the behaviors_defaults
template; gigaflow validation has separate metric-only and render
sections at different intervals.

driving_behaviours_eval.ini deleted — folded into drive.ini.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 pufferlib/config/ocean/drive.ini              | 221 +++++++++++++-----
 .../config/ocean/driving_behaviours_eval.ini  |  64 -----
 2 files changed, 160 insertions(+), 125 deletions(-)
 delete mode 100644 pufferlib/config/ocean/driving_behaviours_eval.ini

diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini
index bafc523b2..d6d62a025 100644
--- a/pufferlib/config/ocean/drive.ini
+++ b/pufferlib/config/ocean/drive.ini
@@ -190,67 +190,166 @@ show_human_logs = True
 ; Options: List[str to path], str to path (e.g., "resources/drive/training/binaries/map_001.bin"), None
 render_map = none
 
-[eval]
-; Set to True to enable periodic multi-scenario evaluation during training
-multi_scenario_eval = False
-; Set to True to enable periodic multi-scenario render during training (one
-; rollout per scenario, output mp4 per scenario via the EGL render pipeline
-; or HTML replay via viz.generate_interactive_replay depending on
-; multi_scenario_render_backend). Does not affect multi_scenario_eval.
-multi_scenario_render = True
-; Epoch interval between render runs. Independent of eval_interval so metric
-; eval can run on a tighter schedule than the more expensive render.
-multi_scenario_render_interval = 250
-; Render backend for multi_scenario_render: "html" (CPU, viz.generate_interactive_replay)
-; or "egl" (C-side render.h → EGL → PBO → ffmpeg libx264, one mp4 per scenario).
-multi_scenario_render_backend = egl
-; Frequency of evaluation during training (in epochs)
-eval_interval = 25
-; When True, inline eval zeroes road-segment dropout + perturbations and
-; enforces red-light stops. Metrics then reflect performance under clean
-; conditions. The live training policy is re-aligned to the clean env's
-; obs shape via _swap_policy_obs_counts — safe because the GigaFlow
-; encoder is count-invariant (shared MLP + max-pool over segments).
-clean_eval = True
-num_agents = 512
-; Batch size for eval_multi_scenarios (number of scenarios per batch)
-; Path to dataset used for evaluation
-map_dir = "pufferlib/resources/drive/binaries/carla_py123d"
-; Simulation mode for evaluation: "gigaflow" or "replay"
-;   gigaflow — procedurally spawn agents on CARLA towns (needs map-only .bin
-;     files in pufferlib/resources/drive/binaries/carla_py123d)
-;   replay — play logged trajectories from WOMD/nuPlan scenarios (needs
-;     trajectory-bearing .bin files in pufferlib/resources/drive/binaries/womd)
-multi_scenario_simulation_mode = "gigaflow"
-; Total number of scenarios to evaluate
-multi_scenario_num_scenarios = 250
-; Per-scenario step count for replay-mode eval (also used as resample_frequency).
-; 91 = WOMD (9.1s @ 10Hz). 201 = nuPlan (20.1s @ 10Hz). Ignored for gigaflow
-; mode, which always uses a hardcoded 3000-step procedural episode.
-scenario_length = 201
-; Cap the render rollout at this many steps.
-render_max_steps = 201
-backend = PufferEnv
-; WOSAC (Waymo Open Sim Agents Challenge) evaluation settings
-; If True, enables evaluation on realism metrics each time we save a checkpoint
-wosac_realism_eval = False
-wosac_num_rollouts = 32 ; Number of policy rollouts per scene
-wosac_init_steps = 10 ; When to start the simulation
-wosac_num_agents = 256 ; Total number of WOSAC agents to evaluate
-wosac_control_mode = "control_wosac" ; Control the tracks to predict
-wosac_init_mode = "create_all_valid" ; Initialize from the tracks to predict
-wosac_goal_radius = 2.0 ; Can shrink goal radius for WOSAC evaluation
-wosac_sanity_check = False
-wosac_aggregate_results = True ; Only return aggregate results across all scenes
-; If True, enable human replay evaluation (pair policy-controlled agent with human replays)
-human_replay_eval = False
-human_replay_control_mode = "control_sdc_only" ; Control only the self-driving car
-human_replay_num_agents = 64 ; This equals the number of scenarios, since we control one agent in each
-; Evaluating different driving behaviours learned by the policy
-driving_behaviours_eval = True
-driving_behaviours_eval_config = "pufferlib/config/ocean/driving_behaviours_eval.ini"
-driving_behaviours_eval_interval = 250
-render_driving_behaviours = True
+; ===========================================================================
+; Evaluation suites
+;
+; Each [eval.<name>] section is one Evaluator instance. EvalManager discovers
+; them automatically (any section under [eval]); those with a `type` field
+; are instantiated, and sections without a `type` field are templates —
+; referenced from other sections via `inherits = "<section>"`.
+;
+; Field reference:
+;   type         — registered evaluator class (multi_scenario, behavior_class,
+;                  human_replay, wosac)
+;   enabled      — true|false
+;   interval     — epochs between runs (0 disables)
+;   mode         — "inline" (block training) | "subprocess" (spawn process)
+;   inherits     — pull defaults from another section, recursively
+;   clean        — true → zero perturbations + dropout + enforce red lights
+;   render       — true → capture mp4(s) during rollout
+;   render_views — list of camera views: sim_state, bev, topdown_sim, bev_all
+;   env.<key>    — any [env] override (dotted key)
+;   eval.<key>   — evaluator-specific knob (e.g. num_scenarios)
+;   vec.<key>    — any [vec] override
+; ===========================================================================
+
+[eval.validation_gigaflow]
+type = "multi_scenario"
+enabled = true
+interval = 25
+mode = "inline"
+clean = true
+render = false
+env.simulation_mode = "gigaflow"
+env.map_dir = "pufferlib/resources/drive/binaries/carla_py123d"
+env.num_maps = 8
+env.num_agents = 512
+env.min_agents_per_env = 50
+env.max_agents_per_env = 50
+env.scenario_length = 3000
+env.resample_frequency = 3000
+eval.num_scenarios = 250
+
+[eval.validation_gigaflow_render]
+inherits = "validation_gigaflow"
+enabled = true
+interval = 250
+render = true
+render_views = ["sim_state", "bev"]
+eval.num_scenarios = 5
+
+; ---------------------------------------------------------------------------
+; Driving-behaviour evaluation: nuPlan scenes labeled by scene type. Each
+; behavior is one [eval.behaviors_*] section. All inherit from the template
+; below — change shared knobs in one place.
+; ---------------------------------------------------------------------------
+
+[eval.behaviors_defaults]
+; Template — no `type`, never instantiated directly. Other sections inherit.
+enabled = false
+interval = 250
+mode = "inline"
+clean = true
+render = true
+render_views = ["sim_state", "bev"]
+env.simulation_mode = "replay"
+env.control_mode = "control_sdc_only"
+env.init_mode = "create_all_valid"
+env.scenario_length = 201
+env.max_partner_observations = 32
+eval.num_scenarios = 50
+
+[eval.behaviors_full_dir]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/nuplan_mini_train_bins"
+
+[eval.behaviors_hard_stop]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/hard_stop"
+
+[eval.behaviors_highway_straight]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/highway_straight"
+
+[eval.behaviors_lane_change]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/lane_change"
+
+[eval.behaviors_merge]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/merge"
+
+[eval.behaviors_parked_cars]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/parked_cars"
+
+[eval.behaviors_roundabout]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/roundabout"
+
+[eval.behaviors_stopped_traffic]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/stopped_traffic"
+
+[eval.behaviors_traffic_light_green]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/traffic_light_green"
+
+[eval.behaviors_traffic_light_stop]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/traffic_light_stop"
+
+[eval.behaviors_unprotected_left]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true
+env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/unprotected_left"
+
+[eval.behaviors_unprotected_right]
+inherits = "behaviors_defaults"
+type = "behavior_class"
+enabled = true +env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/unprotected_right" + +; --------------------------------------------------------------------------- +; Optional: WOSAC realism eval. Off by default. +; --------------------------------------------------------------------------- + +[eval.wosac] +type = "wosac" +enabled = false +interval = 500 +mode = "subprocess" +clean = true +render = false +env.control_mode = "control_wosac" +env.init_mode = "create_all_valid" +env.init_steps = 10 +env.goal_radius = 2.0 +eval.wosac_num_rollouts = 32 +eval.wosac_num_agents = 256 +eval.wosac_sanity_check = false +eval.wosac_aggregate_results = true ; [sweep.train.learning_rate] ; distribution = log_normal diff --git a/pufferlib/config/ocean/driving_behaviours_eval.ini b/pufferlib/config/ocean/driving_behaviours_eval.ini deleted file mode 100644 index 02896db99..000000000 --- a/pufferlib/config/ocean/driving_behaviours_eval.ini +++ /dev/null @@ -1,64 +0,0 @@ -; Configuration for driving behaviour evaluation using nuPlan mini-train -; scenarios labeled by scene type. Built from py123d 0.2.1 reconvert of -; .bin files under /scratch/ev2237/data/nuplan/categories_v021/. -; -; Eval runs in REPLAY mode (simulation_mode=replay, control_mode=control_sdc_only) -; using the same reward weights as training (no reward conditioning). Scenario -; length is 201 (nuPlan with duration_s=20 at 10Hz → 20.1s). -; -; Categories with an empty folder are omitted — driving_behaviours_eval errors -; if map_dir has no .bin files. Add new categories by labeling more scenes -; (see scripts/render_scenario.py --view bev) and copying them into the -; corresponding /scratch/ev2237/data/nuplan/categories_v021// folder. - -[eval_full_dir] -map_dir = "/scratch/ev2237/data/nuplan/nuplan_mini_train_bins" -scenario_length = 201 -; Random-sample this many bins from map_dir each eval pass (fresh sample -; per pass). Cap keeps wall-clock manageable; 876-bin full sweep would -; take ~25 min, 50 bins takes ~1.5 min. 
-num_scenarios = 50
-
-[eval_hard_stop]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/hard_stop"
-scenario_length = 201
-
-[eval_highway_straight]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/highway_straight"
-scenario_length = 201
-
-[eval_lane_change]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/lane_change"
-scenario_length = 201
-
-[eval_merge]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/merge"
-scenario_length = 201
-
-[eval_parked_cars]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/parked_cars"
-scenario_length = 201
-
-[eval_roundabout]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/roundabout"
-scenario_length = 201
-
-[eval_stopped_traffic]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/stopped_traffic"
-scenario_length = 201
-
-[eval_traffic_light_green]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_green"
-scenario_length = 201
-
-[eval_traffic_light_stop]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_stop"
-scenario_length = 201
-
-[eval_unprotected_left]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_left"
-scenario_length = 201
-
-[eval_unprotected_right]
-map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_right"
-scenario_length = 201

From 0360a83f2429f5a6f7930e663fcb3249679aaa83 Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sat, 9 May 2026 18:30:23 -0400
Subject: [PATCH 03/26] [WIP] eval: rip out legacy eval functions, wire EvalManager into PuffeRL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removed from pufferl.py (~990 lines): eval(), eval_multi_scenarios(),
eval_multi_scenarios_render(), build_eval_overrides(),
load_eval_multi_scenarios_config(), _swap_policy_obs_counts(),
_render_driving_behaviours(), _export_metrics(), _log_eval_metrics(),
verify_scenario_coverage(), verify_scenario_coverage_gigaflow().
Plus the legacy eval block in PuffeRL.evaluate() and the
driving_behaviours_eval.ini loader in load_config.

Removed from utils.py (~300 lines): run_human_replay_eval_in_subprocess,
run_wosac_eval_in_subprocess, run_driving_behaviours_eval_in_subprocess.

PuffeRL.evaluate() now calls self._eval_manager.maybe_run() — single
unified path for all evals. main() wires `puffer eval --evaluator <name>
--out <path>` for both standalone and subprocess use.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 pufferlib/pufferl.py | 1355 +++---------------------------------------
 pufferlib/utils.py   |  303 ----------
 2 files changed, 80 insertions(+), 1578 deletions(-)

diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py
index d501a2bc4..7176c9759 100644
--- a/pufferlib/pufferl.py
+++ b/pufferlib/pufferl.py
@@ -7,8 +7,6 @@
 import numbers
 import warnings
 
-import pandas as pd
-
 warnings.filterwarnings("error", category=RuntimeWarning)
@@ -257,6 +255,8 @@ def __init__(self, config, vecenv, policy, logger=None):
         self.losses = {}
         self.best_score = -float("inf")
         self.ema_max = 0.0
+        # Set in train() via EvalManager.from_config (before evaluate() fires).
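+        # (Stays None for PuffeRL objects constructed outside train(); the
+        # maybe_run call in the train loop is guarded on that.)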
+        self._eval_manager = None
 
         # Dashboard
         self.model_size = sum(p.numel() for p in policy.parameters() if p.requires_grad)
@@ -457,198 +457,18 @@ def train(self):
             except Exception as e:
                 print(f"Failed to export model weights: {e}")
 
-        if self.config["eval"]["wosac_realism_eval"] and (
-            self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training
-        ):
-            pufferlib.utils.run_wosac_eval_in_subprocess(self.config, self.logger, self.global_step)
-
-        if self.config["eval"]["human_replay_eval"] and (
-            self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training
-        ):
-            pufferlib.utils.run_human_replay_eval_in_subprocess(self.config, self.logger, self.global_step)
-
-        if self.config["eval"]["wosac_realism_eval"] and (
-            self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training
-        ):
-            pufferlib.utils.run_wosac_eval_in_subprocess(self.config, self.logger, self.global_step)
-
-        if self.config["eval"]["human_replay_eval"] and (
-            self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training
-        ):
-            pufferlib.utils.run_human_replay_eval_in_subprocess(self.config, self.logger, self.global_step)
-
-        behaviours_eval_enabled = self.config["eval"].get("driving_behaviours_eval", False)
-        behaviours_eval_interval = int(self.config["eval"].get("driving_behaviours_eval_interval", 25))
-        behaviours_config = self.config.get("driving_behaviours_eval")
-        if (
-            behaviours_eval_enabled
-            and behaviours_config
-            and behaviours_eval_interval > 0
-            and (self.epoch % behaviours_eval_interval == 0 or done_training)
-        ):
-            self.save_checkpoint()
-            pufferlib.utils.run_driving_behaviours_eval_in_subprocess(
-                self.config, self.logger, self.global_step, behaviours_config
-            )
-            if self.config["eval"].get("render_driving_behaviours"):
-                self._render_driving_behaviours(behaviours_config)
-
-        if self.config["eval"]["multi_scenario_eval"] and (
-            self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training
-        ):
-            # Get evaluation settings from config
-            eval_simulation_mode = self.config["eval"]["multi_scenario_simulation_mode"]
-            num_agents_eval = self.config["eval"]["num_agents"]
-            map_dir = self.config["eval"]["map_dir"]
-
-            # Inline eval runs "clean" by default — perturbations + dropout off,
-            # red-light stops enforced — so the logged validation metrics
-            # track progress under controlled conditions rather than noisy
-            # training perturbations. The live training policy's road slicing
-            # is re-aligned to the clean env at eval time via
-            # _swap_policy_obs_counts inside eval_multi_scenarios.
-            clean_eval = self.config["eval"].get("clean_eval", True)
-            eval_overrides = build_eval_overrides(
-                simulation_mode=eval_simulation_mode,
-                num_agents=num_agents_eval,
-                num_scenarios=self.config["eval"]["multi_scenario_num_scenarios"],
-                map_dir=map_dir,
-                num_carla_maps=self.config["eval"].get("num_carla_maps", 8),
-                clean=clean_eval,
-                scenario_length=self.config["eval"].get("scenario_length"),
-            )
-
-            # Build eval args by applying overrides to training config
-            eval_args = load_eval_multi_scenarios_config(
+        # All evaluation is now driven by the unified EvalManager. Each
+        # [eval.<name>] section in drive.ini is one evaluator instance;
+        # the manager fires any whose interval divides this epoch. See
+        # docs/eval_unification.md for the design.
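+        # For example, [eval.validation_gigaflow] (interval = 25) fires at
+        # epochs 25, 50, 75, ... and logs under the "validation_gigaflow/"
+        # metric prefix; the [eval.behaviors_*] sections fire every 250 epochs.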
+ if self._eval_manager is not None: + self._eval_manager.maybe_run( + epoch=self.epoch, + policy=self.uncompiled_policy, env_name=self.config["env"], - model_path=None, # No saved model - using current policy in memory - eval_overrides=eval_overrides, - ) - # Add inline-specific settings - eval_args["global_step"] = self.global_step # Log by global step for TensorBoard - eval_args["num_scenarios"] = self.config["eval"]["multi_scenario_num_scenarios"] - eval_args["eval_simulation"] = eval_simulation_mode - - # Mark this as inline evaluation and set results folder in experiments - eval_args["inline_eval"] = True # Flag to indicate inline evaluation during training - experiment_name = f"{self.config['env']}_{self.logger.run_id}" - eval_args["load_model_path"] = os.path.join( - self.config["data_dir"], experiment_name, "models", f"inline_epoch_{self.epoch}.pt" + logger=self.logger, + global_step=self.global_step, ) - # For inline eval, results go in experiments folder instead of benchmark - eval_args["eval_results_dir"] = os.path.join( - self.config["data_dir"], - experiment_name, - "validation", - f"epoch_{self.epoch}", - self.config["eval"]["multi_scenario_simulation_mode"], - ) - - # Call eval_multi_scenarios inline with current policy and logger - print(f"\n🔄 Running multi-scenario evaluation at step {self.global_step}...") - eval_multi_scenarios( - env_name=self.config["env"], - args=eval_args, - vecenv=None, # Let it create its own eval environment - policy=self.uncompiled_policy, # Pass current policy - logger=self.logger, # Pass logger for TensorBoard logging - metric_prefix="validation", # Use validation_ prefix - quiet=True, # Suppress verbose output during inline eval - clean=clean_eval, - ) - - # Multi-scenario render — independent interval so the heavier render - # path doesn't have to fire every eval_interval. Mirrors the block - # above but calls eval_multi_scenarios_render with render=True and - # the configured backend ("egl" by default on this branch, writes - # one mp4 per scenario via the C render.h pipeline). 
- if self.config["eval"]["multi_scenario_render"] and ( - self.epoch % self.config["eval"]["multi_scenario_render_interval"] == 0 or done_training - ): - render_simulation_mode = self.config["eval"]["multi_scenario_simulation_mode"] - num_agents_render = self.config["eval"]["num_agents"] - render_map_dir = self.config["eval"]["map_dir"] - clean_render = self.config["eval"].get("clean_eval", True) - - render_overrides = build_eval_overrides( - simulation_mode=render_simulation_mode, - num_agents=num_agents_render, - num_scenarios=self.config["eval"]["multi_scenario_num_scenarios"], - map_dir=render_map_dir, - num_carla_maps=self.config["eval"].get("num_carla_maps", 8), - clean=clean_render, - scenario_length=self.config["eval"].get("scenario_length"), - ) - - render_args = load_eval_multi_scenarios_config( - env_name=self.config["env"], - model_path=None, - eval_overrides=render_overrides, - ) - render_args["global_step"] = self.global_step - render_args["num_scenarios"] = self.config["eval"]["multi_scenario_num_scenarios"] - render_args["eval_simulation"] = render_simulation_mode - render_args["render"] = True # master on/off for the render branch - render_args["render_obs"] = False # HTML-only; EGL path ignores - - render_args["inline_eval"] = True - experiment_name = f"{self.config['env']}_{self.logger.run_id}" - render_args["load_model_path"] = os.path.join( - self.config["data_dir"], experiment_name, "models", f"inline_epoch_{self.epoch}.pt" - ) - render_args["eval_results_dir"] = os.path.join( - self.config["data_dir"], - experiment_name, - "renders", - f"epoch_{self.epoch:08d}", - self.config["eval"]["multi_scenario_simulation_mode"], - ) - - backend_name = self.config["eval"]["multi_scenario_render_backend"] - print(f"\n🎬 Running multi-scenario {backend_name} render at step {self.global_step}...") - # Render failures (missing map dir, corrupted .bin files, ffmpeg - # absent, EGL unavailable, etc.) should NEVER crash training — the - # render is a logging side-channel. Catch any exception here, log - # it, and let training keep going. The upstream eval_multi_scenarios - # metric call is separate and already ran, so metric eval continues - # to work even if video rendering is broken. - # Multi-view EGL render: run the full render fn once per view - # (sim_state then bev). Each call creates a fresh vecenv that - # starts at scenario 0, runs all scenarios with one camera, and - # tears down. Doing both views in ONE rollout would not work - # because Drive.step's resample fires at the last step and - # advances starting_map_counter — a re-reset would replay the - # NEXT batch instead of the original one. - _bev_views = [(0, "", "sim_state"), (1, "_bev", "bev")] if backend_name == "egl" else [(0, "", "sim_state")] - for _vmode, _vsuffix, _vlabel in _bev_views: - try: - eval_multi_scenarios_render( - env_name=self.config["env"], - args=dict(render_args), - vecenv=None, - policy=self.uncompiled_policy, - logger=self.logger, - metric_prefix=f"render_{_vlabel}", - quiet=True, - render_backend=backend_name, - view_mode=_vmode, - video_suffix=_vsuffix, - log_view_label=_vlabel, - # Configurable cap: eval.render_max_steps. Default 50 until - # the mystery ~500-c_render-call abort is properly diagnosed. - # Set to 0/negative to disable the cap entirely. 
- render_max_steps=(self.config["eval"].get("render_max_steps", 50) or None), - clean=clean_render, - ) - except Exception as e: - import traceback - - print( - f"\n⚠️ multi_scenario_render failed (view={_vlabel}) at step {self.global_step}: " - f"{type(e).__name__}: {e}" - ) - traceback.print_exc() - print("Training continues.") return logs @@ -959,90 +779,6 @@ def mean_and_log(self): self.logger.log(logs, agent_steps) return logs - def _render_driving_behaviours(self, behaviours_config): - """Render one scenario per driving behaviour class using eval_multi_scenarios_render.""" - import random as _random - - EVAL_SECTIONS_PREFIX = "eval_" - backend_name = self.config["eval"].get("multi_scenario_render_backend", "egl") - bev_views = [(0, "", "sim_state"), (1, "_bev", "bev")] if backend_name == "egl" else [(0, "", "sim_state")] - - for class_name, class_cfg in behaviours_config.items(): - if not class_name.startswith(EVAL_SECTIONS_PREFIX): - continue - map_dir = class_cfg.get("map_dir", "") - if isinstance(map_dir, str): - map_dir = map_dir.strip('"').strip("'") - if not os.path.isdir(map_dir) or not any(f.endswith(".bin") for f in os.listdir(map_dir)): - continue - - short = class_name[len(EVAL_SECTIONS_PREFIX) :] - num_maps = len([f for f in os.listdir(map_dir) if f.endswith(".bin")]) - # Render under clean-eval conditions (zero dropout, zero - # perturbations, enforced red lights) so the mp4s show what - # the policy does under controlled eval, not the noisy - # training-time perturbations. Matches run_driving_behaviours - # _eval_in_subprocess, so the video matches the metric eval. - render_overrides = build_eval_overrides( - simulation_mode="replay", - num_agents=1, - num_scenarios=1, - map_dir=map_dir, - clean=True, - ) - render_overrides["env"]["control_mode"] = "control_sdc_only" - render_overrides["env"]["num_maps"] = num_maps - render_overrides["env"]["scenario_length"] = class_cfg.get("scenario_length", 91) - # Pick a random starting map index so each render epoch shows a - # different scenario from the directory. Without this, the env - # picks scenario 0 every time and we'd always render the same - # first .bin alphabetically. 
- render_overrides["env"]["starting_map"] = _random.randint(0, num_maps - 1) - - render_args = load_eval_multi_scenarios_config( - env_name=self.config["env"], - model_path=None, - eval_overrides=render_overrides, - ) - experiment_name = f"{self.config['env']}_{self.logger.run_id}" - render_args["global_step"] = self.global_step - render_args["num_scenarios"] = 1 - render_args["eval_simulation"] = "replay" - render_args["render"] = True - render_args["inline_eval"] = True - render_args["eval_results_dir"] = os.path.join( - self.config["data_dir"], - experiment_name, - "renders", - f"epoch_{self.epoch:08d}", - "driving_behaviours", - short, - ) - - for vmode, vsuffix, vlabel in bev_views: - try: - eval_multi_scenarios_render( - env_name=self.config["env"], - args=dict(render_args), - vecenv=None, - policy=self.uncompiled_policy, - logger=self.logger, - metric_prefix=f"driving_behaviours/{short}", - render_key_prefix=f"driving_behaviours/{short}/render/{vlabel}", - quiet=True, - render_backend=backend_name, - view_mode=vmode, - video_suffix=vsuffix, - log_view_label=vlabel, - render_max_steps=(self.config["eval"].get("render_max_steps", 50) or None), - clean=True, - ) - except Exception as e: - import traceback - - print(f"DrivingBehavioursRender [{short}] view={vlabel}: {type(e).__name__}: {e}") - traceback.print_exc() - def close(self): self.vecenv.close() self.utilization.stop() @@ -1635,10 +1371,13 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None, early_stop **args["train"], env=env_name, eval=args.get("eval", {}), - driving_behaviours_eval=args.get("driving_behaviours_eval"), ) pufferl = PuffeRL(train_config, vecenv, policy, logger) + from pufferlib.ocean.benchmark.manager import EvalManager + + pufferl._eval_manager = EvalManager.from_config(args) + # Restore optimizer state + step counters when resuming from a checkpoint. # save_checkpoint writes models/model__.pt and trainer_state.pt # (sibling of models/) — so trainer_state.pt is one dir above the .pt path. @@ -1723,994 +1462,66 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None, early_stop return all_logs -def eval(env_name, args=None, vecenv=None, policy=None): - """Evaluate a policy.""" - - args = args or load_config(env_name) - args["env"]["termination_mode"] = 0 - - wosac_enabled = args["eval"]["wosac_realism_eval"] - human_replay_enabled = args["eval"]["human_replay_eval"] - - if wosac_enabled: - args["env"]["map_dir"] = args["eval"]["map_dir"] - dataset_name = args["env"]["map_dir"].split("/")[-1] - - print(f"Running WOSAC realism evaluation with {dataset_name} dataset.\n") - from pufferlib.ocean.benchmark.evaluator import WOSACEvaluator - - backend = args["eval"]["backend"] - assert backend == "PufferEnv" or not wosac_enabled, "WOSAC evaluation only supports PufferEnv backend." 
- - # Configure environment for WOSAC - args["vec"] = dict(backend=backend, num_envs=1) - args["env"]["init_mode"] = args["eval"]["wosac_init_mode"] - args["env"]["control_mode"] = args["eval"]["wosac_control_mode"] - args["env"]["init_steps"] = args["eval"]["wosac_init_steps"] - args["env"]["goal_behavior"] = args["eval"]["wosac_goal_behavior"] - args["env"]["goal_radius"] = args["eval"]["wosac_goal_radius"] - - # Batch size configuration - num_scenes_per_batch = args["eval"]["wosac_batch_size"] - args["env"]["num_agents"] = num_scenes_per_batch * 10 - args["env"]["num_maps"] = args["eval"]["wosac_scenario_pool_size"] - - # Create environment and policy - vecenv = vecenv or load_env(env_name, args) - policy = policy or load_policy(args, vecenv, env_name) - - # Make eval class instance - evaluator = WOSACEvaluator(args) - - # Obtain scores - df_results = evaluator.evaluate(args, vecenv, policy) - - # Average results over scenarios - results_dict = df_results.mean().to_dict() - results_dict["total_num_agents"] = df_results["num_agents_per_scene"].sum() - results_dict["total_unique_scenarios"] = df_results.index.unique().shape[0] - results_dict["realism_meta_score_std"] = df_results["realism_meta_score"].std() - results_dict = {k: v.item() if hasattr(v, "item") else v for k, v in results_dict.items()} - - import json - - print("\nWOSAC_METRICS_START") - print(json.dumps(results_dict)) - print("WOSAC_METRICS_END") - vecenv.close() - return results_dict - - elif human_replay_enabled: - args["env"]["map_dir"] = args["eval"]["map_dir"] - dataset_name = args["env"]["map_dir"].split("/")[-1] - print(f"Running human replay evaluation with {dataset_name} dataset.\n") - from pufferlib.ocean.benchmark.evaluator import HumanReplayEvaluator - - backend = args["eval"].get("backend", "PufferEnv") - args["env"]["map_dir"] = args["eval"]["map_dir"] - args["env"]["num_agents"] = args["eval"]["human_replay_num_agents"] - args["env"]["num_maps"] = len([f for f in os.listdir(args["env"]["map_dir"]) if f.endswith(".bin")]) - - args["vec"] = dict(backend=backend, num_envs=1) - args["env"]["control_mode"] = args["eval"]["human_replay_control_mode"] - args["env"]["scenario_length"] = args["eval"].get("scenario_length", 201) - - vecenv = vecenv or load_env(env_name, args) - policy = policy or load_policy(args, vecenv, env_name) - - print(f"Effective number of scenarios used: {len(vecenv.driver_env.agent_offsets) - 1}") - - evaluator = HumanReplayEvaluator(args) - - # Run rollouts with human replays - results = evaluator.rollout(args, vecenv, policy) - - import json - - print("HUMAN_REPLAY_METRICS_START") - print(json.dumps(results)) - print("HUMAN_REPLAY_METRICS_END") - - return results - - else: # Standard evaluation: Render - backend = args["vec"]["backend"] - if backend != "PufferEnv": - backend = "Serial" - - args["vec"] = dict(backend=backend, num_envs=1) - vecenv = vecenv or load_env(env_name, args) - policy = policy or load_policy(args, vecenv, env_name) - - ob, info = vecenv.reset() - driver = vecenv.driver_env - num_agents = vecenv.observation_space.shape[0] - device = args["train"]["device"] - - state = {} - if args["train"]["use_rnn"]: - state = dict( - lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), - lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), - ) - - frames = [] - while True: - render = driver.render() - if len(frames) < args["save_frames"]: - frames.append(render) - - # Screenshot Ocean envs with F12, gifs with control + F12 - if driver.render_mode == "ansi": 
- print("\033[0;0H" + render + "\n") - time.sleep(1 / args["fps"]) - elif driver.render_mode == "rgb_array": - pass - # import cv2 - # render = cv2.cvtColor(render, cv2.COLOR_RGB2BGR) - # cv2.imshow('frame', render) - # cv2.waitKey(1) - # time.sleep(1/args['fps']) - - with torch.no_grad(): - ob = torch.as_tensor(ob).to(device) - logits, value = policy.forward_eval(ob, state) - action, logprob, _ = pufferlib.pytorch.sample_logits(logits) - action = action.cpu().numpy().reshape(vecenv.action_space.shape) - - if isinstance(logits, torch.distributions.Normal): - action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) - - ob = vecenv.step(action)[0] - - if len(frames) > 0 and len(frames) == args["save_frames"]: - import imageio - - imageio.mimsave(args["gif_path"], frames, fps=args["fps"], loop=0) - frames.append("Done") - - -def load_eval_multi_scenarios_config(env_name, model_path=None, eval_overrides=None): - """Load config for evaluation, merging experiment YAML with defaults.""" - args = load_config(env_name) - if model_path: - experiment_dir = os.path.dirname(os.path.dirname(model_path)) - config_yaml_path = os.path.join(experiment_dir, "config.yaml") - EXCLUDE_KEYS = eval_overrides["env"].keys() - # Override Policy and RNN dimensions from training config - if os.path.exists(config_yaml_path): - print(f"Found config.yaml at {config_yaml_path}. Merging with defaults...") - with open(config_yaml_path, "r") as f: - yaml_config = yaml.safe_load(f) - - for section in ["env", "policy", "rnn"]: - if section in yaml_config and isinstance(yaml_config[section], dict): - for k, v in yaml_config[section].items(): - if k not in EXCLUDE_KEYS: - args[section][k] = v - - # Also copy root-level keys like rnn_name, policy_name - for key in ["rnn_name", "policy_name"]: - if key in yaml_config: - args[key] = yaml_config[key] - - # Update use_rnn based on rnn_name - args["train"]["use_rnn"] = args["rnn_name"] is not None - - # Override env parameters from evaluation config - if eval_overrides: - for section, section_overrides in eval_overrides.items(): - if isinstance(section_overrides, dict): - for k, v in section_overrides.items(): - args[section][k] = v - else: - args[section] = section_overrides - - return args - - -def build_eval_overrides( - simulation_mode, num_agents, num_scenarios, map_dir=None, num_carla_maps=8, clean=False, scenario_length=None -): - """Build evaluation overrides for a given simulation mode. - - Args: - simulation_mode: "gigaflow" or "replay" - num_agents: agent slot budget for evaluation - map_dir: replay dataset directory, required for replay mode - clean: if True, run a "clean" eval — zero road-segment dropout and - enforce red-light stops. Only safe when the policy is rebuilt - from the eval env (standalone eval / render_scenario.py). Inline - eval during training reuses the live training policy, whose - encoder was built for the training obs shape; zeroing dropout - there changes the obs shape and triggers a CUDA device-side - assert. Perturbation probabilities (partner_blindness, - phantom_braking) are always forced to zero at eval — they're - pure randomness, they don't change the obs shape, and eval - should be deterministic regardless of clean mode. - scenario_length: replay-mode scenarios per-step count (also used as - resample_frequency). Defaults to 91 — WOMD's 9.1s @ 10Hz. nuPlan - scenes from the categorized py123d pipeline want 201 (20.1s). - Ignored in gigaflow mode (procedural episodes always run for the - hardcoded 3000-step budget). 
- """ - # Common reward coefficients (same for both modes) - common_env = { - "eval_mode": 1, - "collision_behavior": 1, - "offroad_behavior": 1, - "traffic_light_behavior": 1 if clean else 0, - "reward_randomization": False, - "reward_vehicle_collision": 3.0, - "reward_offroad_collision": 3.0, - "reward_ade": 0.0, - "reward_goal": 1.0, - "reward_overspeed": 0.05, - "reward_comfort": 0.05, - "reward_velocity": 0.0025, - "reward_lane_align": 0.025, - "reward_lane_center": 0.0038, - "reward_timestep": 0.000025, - # Always zero perturbations at eval. These don't change obs shape so - # it's safe to force even for inline eval, and a deterministic eval - # is what we want for tracking progress. - "partner_blindness_prob": 0.0, - "phantom_braking_prob": 0.0, - "phantom_braking_trigger_prob": 0.0, - } - - if clean: - # Dropout changes the obs shape. Only safe when the policy is - # rebuilt from the eval env (standalone eval / render_scenario). - # NEVER pass clean=True from an inline-eval call site — the live - # training policy's encoder was built for the training obs shape. - common_env["lane_segment_dropout"] = 0.0 - common_env["boundary_segment_dropout"] = 0.0 - - if simulation_mode == "gigaflow": - eval_overrides = { - "env": { - **common_env, - "simulation_mode": "gigaflow", - "min_agents_per_env": 50, - "max_agents_per_env": 50, - "resample_frequency": 3000, - "scenario_length": 3000, - # Point at the py123d-converted CARLA towns added to this branch. - # The older binaries/carla dir predates the 123Drive pipeline and - # is not populated on emerge/temp_training. - "map_dir": map_dir or "pufferlib/resources/drive/binaries/carla_py123d", - "num_maps": num_carla_maps, - "num_agents": num_agents, - "termination_mode": 0.0, - } - } - elif simulation_mode == "replay": - replay_len = scenario_length if scenario_length is not None else 91 - eval_overrides = { - "env": { - **common_env, - "simulation_mode": "replay", - "resample_frequency": replay_len, - "scenario_length": replay_len, - "max_agents_per_env": 64, - "map_dir": map_dir or "pufferlib/resources/drive/binaries/womd", - "num_maps": num_scenarios, - "num_agents": num_agents, - "min_agents_per_env": 1, - "termination_mode": 0.0, - # "control_mode": "control_sdc_only", - }, - } - else: - raise ValueError(f"Invalid simulation_mode: {simulation_mode}. Must be 'gigaflow' or 'replay'.") - - return eval_overrides - - -@contextlib.contextmanager -def _swap_policy_obs_counts(policy, vecenv): - """Temporarily align the policy's road-segment slicing with the eval env. - - Training uses dropout > 0 → smaller obs_{lane,boundary}_segment_count. - Clean eval uses dropout = 0 → larger counts, larger obs buffer. The - GigaFlow encoder (lane_encoder / boundary_encoder) is a shared MLP - applied per-segment with max-pool — its weights are count-invariant. - Only the obs-buffer slicing in DriveBackbone.forward depends on these - counts, so we can just swap them for the duration of the eval and the - same training policy works on the larger clean obs. - """ - try: - eval_env = vecenv.driver_env - new_lane = int(eval_env.obs_lane_segment_count) - new_boundary = int(eval_env.obs_boundary_segment_count) - except AttributeError: - # If the eval env doesn't expose these (unknown wrapper), skip the - # swap — forward will still work when training and eval obs shapes - # coincide (clean=False or no dropout configured). 
-        yield
-        return
-
-    targets = []
-    for m in policy.modules():
-        if hasattr(m, "obs_lane_segment_count") and hasattr(m, "obs_boundary_segment_count"):
-            targets.append(m)
-
-    saved = [(m.obs_lane_segment_count, m.obs_boundary_segment_count) for m in targets]
-    try:
-        for m in targets:
-            m.obs_lane_segment_count = new_lane
-            m.obs_boundary_segment_count = new_boundary
-        yield
-    finally:
-        for m, (orig_lane, orig_boundary) in zip(targets, saved):
-            m.obs_lane_segment_count = orig_lane
-            m.obs_boundary_segment_count = orig_boundary
-
-
-def verify_scenario_coverage(csv_path: str, num_scenarios: int) -> dict:
-    """
-    Verify that episode_metrics.csv contains all expected scenarios.
-
-    Args:
-        csv_path: Path to episode_metrics.csv
-        num_scenarios: Expected number of scenarios (e.g., 1000)
-
-    Returns:
-        dict with keys:
-        - complete: bool - True if all scenarios present
-        - expected_count: number of expected scenarios
-        - found_count: number of unique scenarios found
-        - missing: sorted list of missing map names
-        - extra: sorted list of unexpected map names
-        - duplicates: dict mapping map_name -> count (if >1)
-    """
-    df = pd.read_csv(csv_path)
-
-    # Expected: map_000, map_001, ..., map_{num_scenarios-1}
-    expected = {f"map_{i:03d}" for i in range(num_scenarios)}
-    found = set(df["map_name"].unique())
-
-    missing = expected - found
-    extra = found - expected
+def eval(env_name, args=None, vecenv=None, policy=None, evaluator_name=None, out_path=None):
+    """Run a single named evaluator from drive.ini.

-    # Check for duplicates
-    counts = df["map_name"].value_counts()
-    duplicates = {name: count for name, count in counts.items() if count > 1}
+    Standalone form: `puffer eval puffer_drive --evaluator <name>`. The
+    evaluator's config (env/vec overrides, render flag, etc.) comes from
+    the [eval.<name>] section. Loads the policy from `--load-model-path`.

-    complete = len(missing) == 0
-
-    return {
-        "complete": complete,
-        "expected_count": num_scenarios,
-        "found_count": len(found),
-        "missing": sorted(missing),
-        "extra": sorted(extra),
-        "duplicates": duplicates,
-    }
-
-
-def verify_scenario_coverage_gigaflow(csv_path: str, num_scenarios: int) -> dict:
-    """
-    Verify gigaflow evaluation CSV: maps repeat across scenarios, so check total
-    row count rather than unique map names.
+    Subprocess form: `--out <path>` writes the result dict to a JSON file
+    so the parent EvalManager can read structured metrics back without
+    parsing stdout.
""" - df = pd.read_csv(csv_path) - total_rows = len(df) - complete = total_rows == num_scenarios - return { - "complete": complete, - "expected_count": num_scenarios, - "found_count": total_rows, - } - - -# Helper functions for eval_multi_scenarios and eval_multi_scenarios_render -def _export_metrics(global_infos, eval_folder, num_scenarios, quiet, verify_coverage=False, simulation_mode="replay"): - """Export episode and summary CSVs, return avg_infos dict.""" - # Episode Metrics - try: - df_episodes = pd.DataFrame(global_infos) - first_cols = ["episode_id", "map_name"] - other_cols = [col for col in df_episodes.columns if col not in first_cols] - new_col_order = first_cols + other_cols - df_episodes = df_episodes[new_col_order] - - if verify_coverage: - df_episodes = df_episodes.sort_values(by=["map_name", "episode_id"]) - - episode_csv_path = os.path.join(eval_folder, "episode_metrics.csv") - df_episodes.to_csv(episode_csv_path, index=False) - if not quiet: - print(f"\n✅ Per-episode metrics exported to {episode_csv_path}") - - if verify_coverage: - if simulation_mode == "gigaflow": - result = verify_scenario_coverage_gigaflow(episode_csv_path, num_scenarios) - if not quiet: - if result["complete"]: - print(f"✅ All {num_scenarios} episodes present in CSV") - else: - print( - f"⚠️ Episode count mismatch: expected {result['expected_count']}, found {result['found_count']}" - ) - else: - result = verify_scenario_coverage(episode_csv_path, num_scenarios) - if not quiet: - if result["complete"]: - print(f"✅ All {num_scenarios} scenarios present in CSV") - else: - print(f"⚠️ Scenario coverage incomplete:") - print(f" Expected: {result['expected_count']}, Found: {result['found_count']}") - if result["missing"]: - print(f" Missing ({len(result['missing'])}): {result['missing']}") - if result["extra"]: - print(f" Extra: {result['extra'][:10]}...") - if result["duplicates"]: - print(f" Duplicates: {len(result['duplicates'])} scenarios have multiple entries") - for name, count in sorted(result["duplicates"].items()): - print(f" {name}: {count} entries") - except Exception as e: - print(f"\n⚠️ Could not export per-episode CSV. 
Error: {e}") - print("Global infos data:", global_infos) - - # Evaluation average metrics - avg_infos = {} - for k, v in global_infos.items(): - if k == "num_scenarios": - avg_infos[k] = np.sum(v) - elif v and isinstance(v[0], numbers.Number): - avg_infos[k] = np.mean(v) - df_summary = pd.DataFrame(list(avg_infos.items()), columns=["Metric", "Average"]) - summary_csv_path = os.path.join(eval_folder, "evaluation_summary.csv") - df_summary.to_csv(summary_csv_path, index=False) - if not quiet: - print(f"\n✅ Average results exported to {summary_csv_path}") - print(df_summary.to_string(index=False)) - - return avg_infos - - -def _log_eval_metrics(logger, avg_infos, args, metric_prefix, quiet): - """Log metrics to TensorBoard/wandb if logger is provided.""" - if logger is None or args.get("global_step") is None: - return - - global_step = args["global_step"] - - # Create log dict with metric prefix (use / for TensorBoard grouping) - log_dict = {} - for metric_key, metric_value in avg_infos.items(): - if isinstance(metric_value, (int, float)): - log_dict[f"{metric_prefix}/{metric_key}"] = float(metric_value) - - # Log to TensorBoard if available - if hasattr(logger, "local_writer") and logger.local_writer: - for key, value in log_dict.items(): - logger.local_writer.add_scalar(key, value, global_step) - if not quiet: - print(f"✅ Logged {len(log_dict)} validation metrics to TensorBoard at step {global_step}") - - # Also log to wandb/neptune if available - if hasattr(logger, "log"): - logger.log(log_dict, global_step) - - -def eval_multi_scenarios( - env_name, - args=None, - vecenv=None, - policy=None, - logger=None, - metric_prefix="validation", - quiet=False, - clean=False, -): - t0 = time.time() - - if args is None: - tmp_args = load_config(env_name) - model_path = tmp_args.get("load_model_path") - num_agents_eval = tmp_args["eval"]["num_agents"] - map_dir = tmp_args["eval"]["map_dir"] - - # CLI standalone entry point: read clean_eval from the eval section - # so users can enable it via --eval.clean-eval. Inline callers pass - # clean= directly and come in through the args-provided branch. 
-        clean_from_config = tmp_args["eval"].get("clean_eval", False)
-        eval_overrides = build_eval_overrides(
-            simulation_mode=tmp_args["eval_simulation"],
-            num_agents=num_agents_eval,
-            num_scenarios=tmp_args["num_scenarios"],
-            map_dir=map_dir,
-            num_carla_maps=tmp_args.get("num_carla_maps", 8),
-            clean=clean_from_config,
-            scenario_length=tmp_args["eval"].get("scenario_length"),
-        )
-        args = load_eval_multi_scenarios_config(env_name, model_path, eval_overrides)
-        clean = clean or clean_from_config
-
-    # Reproducibility — same approach as training
-    seed = args["train"]["seed"] or 42
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-
-    backend = args["vec"]["backend"]
-    num_scenarios = args["num_scenarios"]
-
-    num_workers = min(args["vec"]["num_envs"], num_scenarios)
-
-    # Distribute scenarios across workers
-    scenarios_per_worker = num_scenarios // num_workers
-    remainder = num_scenarios % num_workers
-    current_start = 0
-    env_kwargs_list = []
-    for j in range(num_workers):
-        worker_kwargs = copy.deepcopy(args["env"])
-        worker_num_scenario = scenarios_per_worker + (1 if j < remainder else 0)
-        worker_kwargs["starting_map"] = current_start
-        worker_kwargs["num_eval_scenarios"] = worker_num_scenario
-        env_kwargs_list.append(worker_kwargs)
-        current_start += worker_num_scenario
-
-    print(f"Distributing {num_scenarios} scenarios across {num_workers} workers:")
-    for j, w in enumerate(env_kwargs_list):
-        start = w["starting_map"]
-        count = w["num_eval_scenarios"]
-        print(f"  Worker {j}: maps {start}-{start + count - 1} ({count} scenarios)")
-
-    args["vec"] = dict(backend=backend, num_envs=num_workers, num_workers=num_workers, batch_size=num_workers)
-
-    if vecenv is None:
-        package = args["package"]
-        module_name = "pufferlib.ocean" if package == "ocean" else f"pufferlib.environments.{package}"
-        env_module = importlib.import_module(module_name)
-        make_env = env_module.env_creator(env_name)
-        # Pass as lists to preserve per-worker env_kwargs
-        env_creators = [make_env] * num_workers
-        env_args = [[]] * num_workers
-        vecenv = pufferlib.vector.make(env_creators, env_args=env_args, env_kwargs=env_kwargs_list, **args["vec"])
+    from pufferlib.ocean.benchmark.manager import EvalManager

-    policy = policy or load_policy(args, vecenv, env_name)
-    policy.eval()
-    num_agents = vecenv.observation_space.shape[0]
-    device = args["train"]["device"]
+    args = args or load_config(env_name)

-    state = {}
-    if args["train"]["use_rnn"]:
-        state = dict(
-            lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device),
-            lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device),
+    if evaluator_name is None:
+        evaluator_name = args.get("evaluator")
+    if evaluator_name is None:
+        raise pufferlib.APIUsageError(
+            "puffer eval requires --evaluator <name>; named [eval.<name>]
sections live in drive.ini" ) - # Folder for evaluation results - # For inline evaluation during training, use eval_results_dir in experiments folder - # For standalone evaluation, use benchmark folder - if "inline_eval" in args and args["inline_eval"] and "eval_results_dir" in args: - eval_folder = args["eval_results_dir"] - else: - # Standalone evaluation path (in benchmark folder) - model_path = args["load_model_path"] - if model_path is None: - eval_folder = os.path.join("benchmark", "no_policy", args["eval_simulation"]) - else: - model_filename_with_ext = os.path.basename(model_path) - model_name = os.path.splitext(model_filename_with_ext)[0] - models_dir = os.path.dirname(model_path) - experiment_dir = os.path.dirname(models_dir) - experiment_name = os.path.basename(experiment_dir) - eval_folder = os.path.join("benchmark", experiment_name, model_name, args["eval_simulation"]) - os.makedirs(eval_folder, exist_ok=True) - - global_infos = {} - scenarios_processed = 0 - vecenv.async_reset(42) - - ob, _, _, _, infos, _, _ = vecenv.recv() - # Clean eval may use different road-dropout than training. The shared - # training policy's obs slicing needs to be aligned with this env; see - # _swap_policy_obs_counts. - swap_ctx = _swap_policy_obs_counts(policy, vecenv) if clean else contextlib.nullcontext() - with swap_ctx, tqdm(total=num_scenarios, desc="Processing scenarios", disable=quiet) as pbar: - while scenarios_processed < num_scenarios: - # Reset LSTM - if args["train"]["use_rnn"]: - state = dict( - lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), - lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), - ) - - for _ in range(args["env"]["scenario_length"]): - with torch.no_grad(): - ob = torch.as_tensor(ob).to(device) - logits, _ = policy.forward_eval(ob, state) - action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) - action = action.cpu().numpy().reshape(vecenv.action_space.shape) - - if isinstance(logits, torch.distributions.Normal): - action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) - - ob, _, _, _, infos = vecenv.step(action) - - # Multi-worker backend returns infos as list of lists (one per worker) - if infos and infos[0]: - for sub_env in infos: - for env_idx, summary in enumerate(sub_env): - env_map_name = summary["map_name"].split("/")[-1].split(".")[0] - summary["episode_id"] = env_idx - summary["map_name"] = env_map_name - scenarios_processed += 1 - pbar.update(1) - - for k, v in summary.items(): - if k not in global_infos: - global_infos[k] = [] - global_infos[k].append(v) - - avg_infos = _export_metrics( - global_infos, - eval_folder, - num_scenarios, - quiet, - verify_coverage=True, - simulation_mode=args["env"]["simulation_mode"], + manager = EvalManager.from_config(args) + + # Build a fresh vecenv inside the manager via the evaluator's overrides. + # Policy can come from a checkpoint (load_model_path) or be passed in. + if policy is None: + # Need a probe vecenv just to construct the policy with the right + # obs/action spaces. Use the matching evaluator's env_overrides so + # the obs shape matches what the rollout will see. + target = next((e for e in manager.evaluators if e.name == evaluator_name), None) + if target is None: + raise KeyError(f"No [eval.{evaluator_name}] section found. 
Known: {[e.name for e in manager.evaluators]}") + probe_args = manager._build_eval_args(target, env_name=env_name, global_step=None) + probe_vec = load_env(env_name, probe_args) + policy = load_policy(probe_args, probe_vec, env_name) + probe_vec.close() + + result = manager.run_one_by_name( + evaluator_name, + policy=policy, + env_name=env_name, + logger=None, + global_step=args.get("global_step"), ) - print(f"\nTotal evaluation time: {time.time() - t0:.2f} seconds for {num_scenarios} scenarios.") - _log_eval_metrics(logger, avg_infos, args, metric_prefix, quiet) - - # Close vectorized environment to avoid file descriptor leaks - vecenv.close() - - -def eval_multi_scenarios_render( - env_name, - args=None, - vecenv=None, - policy=None, - logger=None, - metric_prefix="validation", - quiet=False, - render_backend="html", - view_mode=0, - video_suffix="", - log_view_label="render", - render_max_steps=None, - render_key_prefix=None, - clean=False, -): - # Set fixed seed for reproducible evaluation - np.random.seed(42) - torch.manual_seed(42) - - if args is None: - tmp_args = load_config(env_name) - model_path = tmp_args.get("load_model_path") - num_agents_eval = tmp_args["eval"]["num_agents"] - map_dir = tmp_args["eval"]["map_dir"] - clean_from_config = tmp_args["eval"].get("clean_eval", False) - eval_overrides = build_eval_overrides( - simulation_mode=tmp_args["eval_simulation"], - num_agents=num_agents_eval, - num_scenarios=tmp_args["num_scenarios"], - map_dir=map_dir, - num_carla_maps=tmp_args.get("num_carla_maps", 8), - clean=clean_from_config, - scenario_length=tmp_args["eval"].get("scenario_length"), - ) - args = load_eval_multi_scenarios_config(env_name, model_path, eval_overrides) - clean = clean or clean_from_config - - backend = args["vec"]["backend"] - if backend != "PufferEnv": - backend = "Serial" - - args["vec"] = dict(backend=backend, num_envs=1) - args["env"]["num_eval_scenarios"] = args["num_scenarios"] # first batch: fill as many scenarios as fit - - # Backend selection. - # "html" — the existing viz.generate_interactive_replay path (CPU-only, - # self-contained HTML per scenario). - # "egl" — the C-side render.h → make_client → client_record_frame - # pipeline (EGL GPU context, PBO double-buffer readback, - # writev → ffmpeg libx264, one mp4 per scenario). - egl_mode = bool(args.get("render")) and render_backend == "egl" - html_mode = bool(args.get("render")) and not egl_mode - if egl_mode: - # Force the C env to RENDER_HEADLESS so make_client spawns ffmpeg and - # (under DRIVE_HAS_EGL) switches the active GL context to the GPU. 
- args["env"]["render_mode"] = "headless" - - vecenv = vecenv or load_env(env_name, args) - - policy = policy or load_policy(args, vecenv, env_name) - policy.eval() - num_agents = vecenv.observation_space.shape[0] - device = args["train"]["device"] - - state = {} - if args["train"]["use_rnn"]: - state = dict( - lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), - lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), - ) - - # Folder for evaluation results - # For inline evaluation during training, use eval_results_dir in experiments folder - # For standalone evaluation, use benchmark folder - if "inline_eval" in args and args["inline_eval"] and "eval_results_dir" in args: - eval_folder = args["eval_results_dir"] - else: - # Standalone evaluation path (in benchmark folder) - model_path = args["load_model_path"] - if model_path is None: - eval_folder = os.path.join("benchmark", "no_policy", args["eval_simulation"]) - else: - model_filename_with_ext = os.path.basename(model_path) - model_name = os.path.splitext(model_filename_with_ext)[0] - models_dir = os.path.dirname(model_path) - experiment_dir = os.path.dirname(models_dir) - experiment_name = os.path.basename(experiment_dir) - eval_folder = os.path.join("benchmark", experiment_name, model_name, args["eval_simulation"]) - os.makedirs(eval_folder, exist_ok=True) - - saved_cwd = None - mp4_folder = None - gif_folder = None - if html_mode: - gif_folder = eval_folder + "/gif" - os.makedirs(gif_folder, exist_ok=True) - if egl_mode: - mp4_folder = os.path.join(eval_folder, "mp4") - os.makedirs(mp4_folder, exist_ok=True) - # C-side make_client writes .mp4 into the process cwd. We - # chdir into mp4_folder so every scenario's file lands in the right - # place, then restore cwd after the rollout loop. - saved_cwd = os.getcwd() - os.chdir(mp4_folder) - - global_infos = {} - num_scenarios = args["num_scenarios"] - - # Apply per-env video suffix once before any render. make_client reads - # env->video_suffix on the first render to build the ffmpeg filename, so - # this must fire before any step. We don't yet know how many internal - # envs are in the vecenv (vecenv.get_state() only works after reset), - # so set on a generous prefix and let extras be no-ops. - if egl_mode and video_suffix: - _target_env_pre = vecenv if not hasattr(vecenv, "envs") else vecenv.envs[0] - # Drive exposes its internal C-level vec env count via num_envs. - # Use it as the loop bound so we never call set_video_suffix on an - # out-of-range env_id (which would corrupt memory before the C - # bounds check landed). - _internal_num_envs = getattr(_target_env_pre, "num_envs", 1) - for _e in range(_internal_num_envs): - try: - _target_env_pre.set_video_suffix(video_suffix, env_idx=_e) - except Exception: - break - scenarios_processed = 0 - # PufferEnv native backend: vecenv IS the Drive env (no .envs list). - # Serial/Multiprocessing: need vecenv.envs[0] to reach the underlying env. - target_env = vecenv if not hasattr(vecenv, "envs") else vecenv.envs[0] - - # Align the live training policy's obs slicing with the (potentially - # clean) eval env for the render. Same swap as eval_multi_scenarios. 
- swap_ctx = _swap_policy_obs_counts(policy, vecenv) if clean else contextlib.nullcontext() - with swap_ctx, tqdm(total=num_scenarios, desc="Processing scenarios", disable=quiet) as pbar: - while scenarios_processed < num_scenarios: - ob, _ = vecenv.reset() - - # Get initial states for all environments in the batch - scenarios = vecenv.get_state() - num_envs_in_batch = len(scenarios) - batch_start = scenarios_processed - - # Prepare batch_size_eval for the resample that fires at end of the step loop. - # That resample will load the NEXT batch, so cap it at remaining_after_this. - remaining_after_this = num_scenarios - scenarios_processed - num_envs_in_batch - target_env.batch_size_eval = max(1, remaining_after_this) - - map_names = [] - for env_idx in range(num_envs_in_batch): - map_names.append(scenarios[env_idx]["map_name"].split("/")[-1].split(".")[0]) - - # Reset LSTM - if args["train"]["use_rnn"]: - state = dict( - lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), - lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), - ) - - # Initialize histories as lists of lists (one list per environment). - # Only needed for the HTML replay path — EGL writes mp4 frames - # directly to ffmpeg via c_render each step. - if html_mode: - agent_histories = [[] for _ in range(num_envs_in_batch)] - traffic_histories = [[] for _ in range(num_envs_in_batch)] - trajectory_histories = [[] for _ in range(num_envs_in_batch)] - all_agents_obs_histories = [[] for _ in range(num_envs_in_batch)] - - _render_steps = args["env"]["scenario_length"] - if render_max_steps is not None: - _render_steps = min(_render_steps, render_max_steps) - for t in range(_render_steps): - if html_mode: - current_scenarios = vecenv.get_state() - start_obs_index = 0 - - # Loop through every environment in the batch to record its history - for env_idx in range(num_envs_in_batch): - env_scenario = current_scenarios[env_idx] - - agent_histories[env_idx].append( - pufferlib.viz.fill_agents_state( - env_scenario, use_trajectory="trajectory" in args["env"]["action_type"] - ) - ) - traffic_histories[env_idx].append(pufferlib.viz.fill_traffics_state(env_scenario, t)) - - if "trajectory" in args["env"]["action_type"]: - trajectory_histories[env_idx].append(pufferlib.viz.fill_trajectories(env_scenario, t)) - - # Collect observation dictionaries for ALL active agents in THIS environment at timestep t - if args["render_obs"]: - step_obs_dict = {} - if env_idx > 0: - start_obs_index += current_scenarios[env_idx - 1]["active_agent_count"] - for agent_idx in range(env_scenario["active_agent_count"]): - agent_id = env_scenario["active_agent_indices"][agent_idx] - step_obs_dict[int(agent_id)] = pufferlib.viz.extract_obs_frame( - ob, - env_scenario, - args, - timestep=t, - obs_index=start_obs_index + agent_idx, - agent_idx=agent_idx, - head_north=True, - ) - all_agents_obs_histories[env_idx].append(step_obs_dict) - - with torch.no_grad(): - ob = torch.as_tensor(ob).to(device) - logits, _ = policy.forward_eval(ob, state) - action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) - action = action.cpu().numpy().reshape(vecenv.action_space.shape) + print("EVAL_RESULT_JSON_START") + import json - if isinstance(logits, torch.distributions.Normal): - action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) - - ob, _, _, _, infos = vecenv.step(action) - - if egl_mode: - # Flush one frame per env through c_render → client_record_frame - # → PBO async readback → writev → ffmpeg pipe. 
make_client is - # called lazily on the first render per env (sets up ffmpeg + - # GPU context) and close_client at scenario end flushes the - # trailing PBO frame. - for e in range(num_envs_in_batch): - target_env.render(env_idx=e, view_mode=view_mode) - - # Serial backend returns infos as single list (infos[0] is the env's info list) - if infos and infos[0]: - for env_idx, summary in enumerate(infos[0]): - env_map_name = summary["map_name"].split("/")[-1].split(".")[0] - summary["episode_id"] = batch_start + env_idx - summary["env_id"] = env_idx - summary["map_name"] = env_map_name - - for k, v in summary.items(): - if k not in global_infos: - global_infos[k] = [] - global_infos[k].append(v) - - if html_mode: - # Loop through every environment to generate its specific HTML replay - for env_idx in range(num_envs_in_batch): - global_episode_id = batch_start + env_idx - # Ensure we don't render padding environments if num_scenarios isn't perfectly divisible by batch_size - if global_episode_id >= num_scenarios: - break - env_map_name = map_names[env_idx] - - pufferlib.viz.generate_interactive_replay( - current_scenarios[env_idx], - agent_histories[env_idx], - traffic_histories[env_idx], - trajectory_histories[env_idx], - all_agents_obs_histories[env_idx], - f"{gif_folder}/{env_map_name}_{global_episode_id:03d}.html", - head_north=True, - ) + print(json.dumps({"name": evaluator_name, "metrics": result.metrics})) + print("EVAL_RESULT_JSON_END") - if egl_mode: - # Close every env's Client so ffmpeg gets EOF on its input pipe, - # the trailing PBO frame is flushed, and libx264 writes the mp4 - # trailer. Without this, the mp4 files are either empty or one - # frame short. - import sys as _sys_cc + if out_path: + with open(out_path, "w") as f: + json.dump( + {"name": evaluator_name, "metrics": result.metrics, "frames": [str(p) for p in result.frames]}, + f, + ) - _sys_cc.stderr.write( - f"[render-instr] starting close_client loop num_envs_in_batch={num_envs_in_batch}\n" - ) - _sys_cc.stderr.flush() - for e in range(num_envs_in_batch): - _sys_cc.stderr.write(f"[render-instr] close_client(env_idx={e}) calling\n") - _sys_cc.stderr.flush() - target_env.close_client(env_idx=e) - _sys_cc.stderr.write(f"[render-instr] close_client(env_idx={e}) returned\n") - _sys_cc.stderr.flush() - - scenarios_processed += num_envs_in_batch - pbar.update(num_envs_in_batch) - - import sys as _sys_instr - - _sys_instr.stderr.write("[render-instr] rollout loop done\n") - _sys_instr.stderr.flush() - - # render_key_prefix overrides metric_prefix for wandb media uploads only. - # This lets callers keep metric_prefix for scalar metrics while using a - # different namespace for renders (e.g. driving_behaviours//render/). - _upload_prefix = render_key_prefix if render_key_prefix is not None else metric_prefix - - if html_mode: - pufferlib.viz.build_gallery_index(gif_folder) - if logger is not None: - try: - import wandb - - html_paths = sorted(os.path.join(gif_folder, f) for f in os.listdir(gif_folder) if f.endswith(".html")) - if html_paths: - step = args.get("global_step") - # Stable key per (category, view); each render epoch overwrites - # the same wandb panel rather than fanning out by scenario UUID. 
- html_log = {_upload_prefix: wandb.Html(html_paths[-1])} - if hasattr(logger, "log"): - logger.log(html_log, step) if step is not None else logger.log(html_log) - if not quiet: - print(f"Uploaded {len(html_paths)} render HTML(s) to wandb") - except Exception as e: - if not quiet: - print(f"Failed to upload render HTMLs to wandb: {e}") - - if saved_cwd is not None: - os.chdir(saved_cwd) - _sys_instr.stderr.write("[render-instr] chdir restored\n") - _sys_instr.stderr.flush() - - avg_infos = _export_metrics(global_infos, eval_folder, num_scenarios, quiet, verify_coverage=False) - _sys_instr.stderr.write("[render-instr] _export_metrics done\n") - _sys_instr.stderr.flush() - _log_eval_metrics(logger, avg_infos, args, metric_prefix, quiet) - _sys_instr.stderr.write("[render-instr] _log_eval_metrics done\n") - _sys_instr.stderr.flush() - - if egl_mode and mp4_folder and logger is not None: - try: - import wandb - - mp4_paths = sorted(os.path.join(mp4_folder, f) for f in os.listdir(mp4_folder) if f.endswith(".mp4")) - if mp4_paths: - # Log under a single stable key per (category, view) so successive - # renders show up in the same wandb panel as a time series. - # The scenario UUID lives in the caption, not the key. - videos = [ - wandb.Video(p, fps=30, format="mp4", caption=os.path.splitext(os.path.basename(p))[0]) - for p in mp4_paths - ] - video_log = {_upload_prefix: videos if len(videos) > 1 else videos[0]} - step = args.get("global_step") - if hasattr(logger, "log"): - logger.log(video_log, step) if step is not None else logger.log(video_log) - if not quiet: - print(f"Uploaded {len(mp4_paths)} render mp4(s) to wandb") - except Exception as e: - if not quiet: - print(f"Failed to upload render mp4s to wandb: {e}") - - # Close vectorized environment to avoid file descriptor leaks - vecenv.close() + return result.metrics def sweep(args=None, env_name=None): @@ -3016,22 +1827,6 @@ def puffer_type(value): args["train"]["use_rnn"] = args["rnn_name"] is not None - # Load driving behaviours eval config if specified - behaviours_config_path = args.get("eval", {}).get("driving_behaviours_eval_config") - if behaviours_config_path: - if isinstance(behaviours_config_path, str): - behaviours_config_path = behaviours_config_path.strip('"').strip("'") - if os.path.exists(behaviours_config_path): - print(f"Loading driving behaviours eval config from {behaviours_config_path}") - bp = configparser.ConfigParser(inline_comment_prefixes=(";", "#")) - bp.read(behaviours_config_path) - behaviours = {} - for section in bp.sections(): - behaviours[section] = {k: puffer_type(v) for k, v in bp[section].items()} - args["driving_behaviours_eval"] = behaviours - else: - print(f"Warning: driving_behaviours_eval_config not found: {behaviours_config_path}") - # Use World size to divide Num_Agents / minibatch size in DDP if "LOCAL_RANK" in os.environ: world_size = int(os.environ.get("WORLD_SIZE", 1)) @@ -3053,12 +1848,22 @@ def main(): if mode == "train": train(env_name=env_name) elif mode == "eval": - eval(env_name=env_name) - elif mode == "eval_multi_scenarios": - eval_multi_scenarios(env_name=env_name) - elif mode == "eval_multi_scenarios_render": - eval_multi_scenarios_render(env_name=env_name) - print("") + # Pull --evaluator and --out from argv before load_config consumes them. 
+ evaluator_name = None + out_path = None + i = 0 + while i < len(sys.argv): + arg = sys.argv[i] + if arg == "--evaluator" and i + 1 < len(sys.argv): + evaluator_name = sys.argv[i + 1] + del sys.argv[i : i + 2] + continue + if arg == "--out" and i + 1 < len(sys.argv): + out_path = sys.argv[i + 1] + del sys.argv[i : i + 2] + continue + i += 1 + eval(env_name=env_name, evaluator_name=evaluator_name, out_path=out_path) elif mode == "sweep": sweep(env_name=env_name) elif mode == "controlled_exp": diff --git a/pufferlib/utils.py b/pufferlib/utils.py index a425dcef3..9efc0c628 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -8,309 +8,6 @@ import json -def run_human_replay_eval_in_subprocess(config, logger, global_step): - """ - Run human replay evaluation in a subprocess and log metrics to wandb. - - """ - try: - run_id = logger.run_id - model_dir = os.path.join(config["data_dir"], f"{config['env']}_{run_id}") - model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) - - if not model_files: - print("No model files found for human replay evaluation") - return - - latest_cpt = max(model_files, key=os.path.getctime) - - # Prepare evaluation command - eval_config = config["eval"] - cmd = [ - sys.executable, - "-m", - "pufferlib.pufferl", - "eval", - config["env"], - "--load-model-path", - latest_cpt, - "--eval.wosac-realism-eval", - "False", - "--eval.human-replay-eval", - "True", - "--eval.human-replay-num-agents", - str(eval_config["human_replay_num_agents"]), - "--eval.human-replay-control-mode", - str(eval_config["human_replay_control_mode"]), - ] - - # Run human replay evaluation in subprocess - result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd()) - - if result.returncode == 0: - # Extract JSON from stdout between markers - stdout = result.stdout - if "HUMAN_REPLAY_METRICS_START" in stdout and "HUMAN_REPLAY_METRICS_END" in stdout: - start = stdout.find("HUMAN_REPLAY_METRICS_START") + len("HUMAN_REPLAY_METRICS_START") - end = stdout.find("HUMAN_REPLAY_METRICS_END") - json_str = stdout[start:end].strip() - human_replay_metrics = json.loads(json_str) - - # Log to wandb if available - if hasattr(logger, "wandb") and logger.wandb: - logger.wandb.log( - { - "eval/human_replay_collision_rate": human_replay_metrics["collision_rate"], - "eval/human_replay_offroad_rate": human_replay_metrics["offroad_rate"], - "eval/human_replay_completion_rate": human_replay_metrics["completion_rate"], - }, - step=global_step, - ) - else: - print(f"Human replay evaluation failed with exit code {result.returncode}: {result.stderr}") - - except subprocess.TimeoutExpired: - print("Human replay evaluation timed out") - except Exception as e: - print(f"Failed to run human replay evaluation: {e}") - - -def run_wosac_eval_in_subprocess(config, logger, global_step): - """ - Run WOSAC evaluation in a subprocess and log metrics to wandb. - - Args: - config: Configuration dictionary containing data_dir, env, and wosac settings - logger: Logger object with run_id and optional wandb attribute - epoch: Current training epoch - global_step: Current global training step - - Returns: - None. Prints error messages if evaluation fails. 
- """ - try: - run_id = logger.run_id - model_dir = os.path.join(config["data_dir"], f"{config['env']}_{run_id}") - model_files = glob.glob(os.path.join(model_dir, "model_*.pt")) - - if not model_files: - print("No model files found for WOSAC evaluation") - return - - latest_cpt = max(model_files, key=os.path.getctime) - - # Prepare evaluation command - eval_config = config.get("eval", {}) - cmd = [ - sys.executable, - "-m", - "pufferlib.pufferl", - "eval", - config["env"], - "--load-model-path", - latest_cpt, - "--eval.wosac-realism-eval", - "True", - "--eval.wosac-num-agents", - str(eval_config.get("wosac_num_agents", 256)), - "--eval.wosac-init-mode", - str(eval_config.get("wosac_init_mode", "create_all_valid")), - "--eval.wosac-control-mode", - str(eval_config.get("wosac_control_mode", "control_wosac")), - "--eval.wosac-init-steps", - str(eval_config.get("wosac_init_steps", 10)), - "--eval.wosac-goal-radius", - str(eval_config.get("wosac_goal_radius", 2.0)), - "--eval.wosac-sanity-check", - str(eval_config.get("wosac_sanity_check", False)), - "--eval.wosac-aggregate-results", - str(eval_config.get("wosac_aggregate_results", True)), - ] - - # Run WOSAC evaluation in subprocess - result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd()) - - if result.returncode == 0: - # Extract JSON from stdout between markers - stdout = result.stdout - if "WOSAC_METRICS_START" in stdout and "WOSAC_METRICS_END" in stdout: - start = stdout.find("WOSAC_METRICS_START") + len("WOSAC_METRICS_START") - end = stdout.find("WOSAC_METRICS_END") - json_str = stdout[start:end].strip() - wosac_metrics = json.loads(json_str) - - # Log to wandb if available - if hasattr(logger, "wandb") and logger.wandb: - logger.wandb.log( - { - "eval/wosac_realism_meta_score": wosac_metrics["realism_meta_score"], - "eval/wosac_ade": wosac_metrics["ade"], - "eval/wosac_min_ade": wosac_metrics["min_ade"], - "eval/wosac_total_num_agents": wosac_metrics["total_num_agents"], - }, - step=global_step, - ) - else: - print(f"WOSAC evaluation failed with exit code {result.returncode}") - print(f"Error: {result.stderr}") - - # Check for memory issues - stderr_lower = result.stderr.lower() - if "out of memory" in stderr_lower or "cuda out of memory" in stderr_lower: - print("GPU out of memory. Skipping this WOSAC evaluation.") - - except subprocess.TimeoutExpired: - print("WOSAC evaluation timed out after 600 seconds") - except MemoryError as e: - print(f"WOSAC evaluation ran out of memory. Skipping this evaluation: {e}") - except Exception as e: - print(f"Failed to run WOSAC evaluation: {type(e).__name__}: {e}") - - -def run_driving_behaviours_eval_in_subprocess(config, logger, global_step, behaviours_config): - """ - Run driving behaviours evaluation for each of the specified scenario classes in a subprocess. - - For each class defined in behaviours_config, calls `puffer eval puffer_drive` with: - - simulation_mode=replay, control_mode=control_sdc_only, init_mode=create_all_valid - - map_dir and num_agents from the class config - Parses HUMAN_REPLAY_METRICS_START/END JSON from stdout and logs to wandb under - driving_behaviours//. 
- """ - sampled_dirs = [] # temp symlink dirs created for num_scenarios sampling - try: - run_id = logger.run_id - model_dir = os.path.join(config["data_dir"], f"{config['env']}_{run_id}") - model_files = glob.glob(os.path.join(model_dir, "models", "model_*.pt")) - - if not model_files: - print("DrivingBehavioursEval: no model files found, skipping.") - return - - latest_cpt = max(model_files, key=os.path.getctime) - EVAL_SECTIONS_PREFIX = "eval_" - classes = [(name, cfg) for name, cfg in behaviours_config.items() if name.startswith(EVAL_SECTIONS_PREFIX)] - - all_results = {} - for class_name, class_cfg in classes: - map_dir = class_cfg.get("map_dir", "") - if isinstance(map_dir, str): - map_dir = map_dir.strip('"').strip("'") - if not os.path.isdir(map_dir): - print( - f"DrivingBehavioursEval [{class_name[len(EVAL_SECTIONS_PREFIX) :]}]: map_dir not found, skipping ({map_dir})" - ) - continue - all_bins = [f for f in os.listdir(map_dir) if f.endswith(".bin")] - if not all_bins: - print( - f"DrivingBehavioursEval [{class_name[len(EVAL_SECTIONS_PREFIX) :]}]: no .bin files in {map_dir}, skipping" - ) - continue - # Optional cap: random-sample N bins each eval pass via a fresh - # symlink dir. Different scenes per pass; better population estimate - # without paying for the full directory. - num_scenarios = class_cfg.get("num_scenarios") - if num_scenarios and int(num_scenarios) < len(all_bins): - k = int(num_scenarios) - sampled = random.sample(all_bins, k) - tmp_dir = tempfile.mkdtemp(prefix=f"db_eval_{class_name}_") - for fname in sampled: - os.symlink(os.path.join(map_dir, fname), os.path.join(tmp_dir, fname)) - map_dir = tmp_dir - sampled_dirs.append(tmp_dir) - num_agents = len([f for f in os.listdir(map_dir) if f.endswith(".bin")]) - scenario_length = class_cfg.get("scenario_length", 201) - short = class_name[len(EVAL_SECTIONS_PREFIX) :] - - cmd = [ - sys.executable, - "-m", - "pufferlib.pufferl", - "eval", - config["env"], - "--load-model-path", - latest_cpt, - "--eval.wosac-realism-eval", - "False", - "--eval.human-replay-eval", - "True", - "--eval.map-dir", - map_dir, - "--eval.human-replay-num-agents", - str(num_agents), - "--eval.human-replay-control-mode", - str(config["eval"].get("human_replay_control_mode", "control_sdc_only")), - "--env.simulation-mode", - "replay", - "--env.init-mode", - "create_all_valid", - "--eval.scenario-length", - str(scenario_length), - # Clean-eval overrides. Mirrors build_eval_overrides(clean=True): - # red lights enforced, no road-segment dropout, no partner - # blindness or phantom braking, wider partner budget. Subprocess - # re-parses the ini so training-time CLI overrides don't leak in - # here. (eval_mode is on ev/clean-eval branch, not this one.) 
- "--env.traffic-light-behavior", - "1", - "--env.lane-segment-dropout", - "0.0", - "--env.boundary-segment-dropout", - "0.0", - "--env.partner-blindness-prob", - "0.0", - "--env.phantom-braking-prob", - "0.0", - "--env.phantom-braking-trigger-prob", - "0.0", - "--env.max-partner-observations", - "32", - ] - - print(f"DrivingBehavioursEval: running class '{short}' with map_dir={map_dir}") - try: - result = subprocess.run(cmd, capture_output=True, text=True, timeout=600, cwd=os.getcwd()) - if result.returncode == 0: - stdout = result.stdout - if "HUMAN_REPLAY_METRICS_START" in stdout and "HUMAN_REPLAY_METRICS_END" in stdout: - start = stdout.find("HUMAN_REPLAY_METRICS_START") + len("HUMAN_REPLAY_METRICS_START") - end = stdout.find("HUMAN_REPLAY_METRICS_END") - metrics = json.loads(stdout[start:end].strip()) - all_results[class_name] = metrics - print(f"DrivingBehavioursEval [{short}]: {metrics}") - else: - print(f"DrivingBehavioursEval [{short}]: no metrics found in output") - else: - print( - f"DrivingBehavioursEval [{short}]: subprocess failed (exit {result.returncode}): {result.stderr[-500:]}" - ) - except subprocess.TimeoutExpired: - print(f"DrivingBehavioursEval [{short}]: timed out") - except Exception as e: - print(f"DrivingBehavioursEval [{short}]: error: {e}") - - # Log all class results to wandb - if hasattr(logger, "wandb") and logger.wandb and all_results: - payload = {} - for class_name, metrics in all_results.items(): - short = class_name[len(EVAL_SECTIONS_PREFIX) :] - for k, v in metrics.items(): - try: - payload[f"driving_behaviours/{short}/{k}"] = float(v) - except (TypeError, ValueError): - pass - if payload: - payload["train_step"] = global_step - logger.wandb.log(payload, step=global_step) - - except Exception as e: - print(f"DrivingBehavioursEval: unexpected error: {e}") - finally: - for d in sampled_dirs: - shutil.rmtree(d, ignore_errors=True) - - def render_videos(config, vecenv, logger, epoch, global_step, bin_path): """ Generate and log training videos using C-based rendering. From 88153155038c04fc3e28c8d3b9a464d582bf83fc Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 9 May 2026 18:32:19 -0400 Subject: [PATCH 04/26] [WIP] eval: explicit goal_advance_mode knob; replaces if-SIMULATION_REPLAY in c_step MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add GOAL_ADVANCE_REGENERATE / GOAL_ADVANCE_SATURATE constants and a goal_advance_mode field on Drive. c_step's last-goal branch now dispatches on goal_advance_mode instead of simulation_mode. Drive.__init__ accepts goal_advance_mode kwarg with auto-pick based on simulation_mode (gigaflow → regenerate, replay → saturate) — same behavior as before, but the choice is explicit and per-eval-overridable. 
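
The resolution rule is small enough to sketch inline. Illustrative
only: it mirrors the drive.py hunk below, and a free function named
resolve_goal_advance_mode is not part of this patch.

    # 0 = GOAL_ADVANCE_REGENERATE, 1 = GOAL_ADVANCE_SATURATE
    # simulation_mode: 0 = SIMULATION_GIGAFLOW, 1 = SIMULATION_REPLAY
    def resolve_goal_advance_mode(goal_advance_mode, simulation_mode):
        if goal_advance_mode is None:
            # auto-pick: replay saturates, gigaflow regenerates
            return 1 if simulation_mode == 1 else 0
        if goal_advance_mode == "regenerate":
            return 0
        if goal_advance_mode == "saturate":
            return 1
        raise ValueError(f"goal_advance_mode must be 'regenerate' or 'saturate'. Got: {goal_advance_mode}")

A replay eval that does want fresh goals can now pass
goal_advance_mode="regenerate" instead of being pinned to the old
if-SIMULATION_REPLAY branch in c_step.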
Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/ocean/drive/binding.c | 1 + pufferlib/ocean/drive/drive.h | 20 ++++++++++++++++---- pufferlib/ocean/drive/drive.py | 17 +++++++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/pufferlib/ocean/drive/binding.c b/pufferlib/ocean/drive/binding.c index b2a4c20b1..0dd93a5e5 100644 --- a/pufferlib/ocean/drive/binding.c +++ b/pufferlib/ocean/drive/binding.c @@ -1788,6 +1788,7 @@ static int my_init(Env *env, PyObject *args, PyObject *kwargs) { env->init_mode = (int)unpack(kwargs, "init_mode"); env->control_mode = (int)unpack(kwargs, "control_mode"); env->simulation_mode = (int)unpack(kwargs, "simulation_mode"); + env->goal_advance_mode = (int)unpack(kwargs, "goal_advance_mode"); env->reward_conditioning = (bool)unpack(kwargs, "reward_conditioning"); env->reward_randomization = (bool)unpack(kwargs, "reward_randomization"); env->compute_eval_metrics = (bool)unpack(kwargs, "compute_eval_metrics"); diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index 17ee20987..701eb1b9f 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -64,6 +64,16 @@ #define SIMULATION_GIGAFLOW 0 #define SIMULATION_REPLAY 1 +// Goal advance modes — chosen when the SDC reaches the last goal in its +// sequence. REGENERATE recomputes a fresh set along the route (the +// gigaflow training pattern). SATURATE leaves the goal queue at its +// final state so the reached-goal condition won't fire again (the +// replay-mode pattern, where regenerating would dereference NULL paths +// for nuPlan bins without route info). Defaults to REGENERATE for +// gigaflow and SATURATE for replay; the Python config layer chooses. +#define GOAL_ADVANCE_REGENERATE 0 +#define GOAL_ADVANCE_SATURATE 1 + // Lane selection scoring #define LANE_SELECTION_DISTANCE_WEIGHT 0.7f #define LANE_SELECTION_HEADING_WEIGHT 0.3f @@ -336,6 +346,7 @@ struct Drive { int init_mode; int control_mode; int simulation_mode; + int goal_advance_mode; int termination_mode; float inactive_agent_threshold; int reward_conditioning; @@ -4866,10 +4877,11 @@ void c_step(Drive *env) { if (agent->current_goal_idx == env->num_target_waypoints) { // Last goal reached env->logs[i].num_goals_reached += 1; - if (env->simulation_mode == SIMULATION_REPLAY) { - // Replay mode: leave current_goal_idx saturated so the - // reached-goal condition won't fire again. Re-generating - // route-based goals on WOMD maps fails (removed=1). + if (env->goal_advance_mode == GOAL_ADVANCE_SATURATE) { + // Leave current_goal_idx saturated so the reached-goal + // condition won't fire again. Used by replay evals where + // regenerating route-based goals on WOMD/nuPlan bins + // would fail (path NULL or removed=1). } else { compute_goals(env, agent_idx); } diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 7dc537ca5..894448dca 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -57,6 +57,7 @@ def __init__( action_type="discrete", dynamics_model="classic", simulation_mode="gigaflow", + goal_advance_mode=None, termination_mode=0, inactive_agent_threshold=0.4, buf=None, @@ -229,6 +230,21 @@ def __init__( else: raise ValueError(f"simulation_mode must be one of 'gigaflow' or 'replay'. Got: {self.simulation_mode_str}") + # goal_advance_mode controls what happens when the SDC reaches the + # last goal in its sequence. None → auto-pick based on simulation_mode + # (gigaflow=regenerate, replay=saturate). 
Explicit values: "regenerate" + # or "saturate". + if goal_advance_mode is None: + self.goal_advance_mode = 1 if self.simulation_mode == 1 else 0 + elif goal_advance_mode == "regenerate": + self.goal_advance_mode = 0 + elif goal_advance_mode == "saturate": + self.goal_advance_mode = 1 + else: + raise ValueError( + f"goal_advance_mode must be one of 'regenerate' or 'saturate'. Got: {goal_advance_mode}" + ) + if self.control_mode_str == "control_vehicles": self.control_mode = 0 elif self.control_mode_str == "control_agents": @@ -387,6 +403,7 @@ def _env_init_kwargs(self, map_file, max_agents): "init_mode": self.init_mode, "control_mode": self.control_mode, "simulation_mode": self.simulation_mode, + "goal_advance_mode": self.goal_advance_mode, "reward_conditioning": self.reward_conditioning, "reward_randomization": self.reward_randomization, "compute_eval_metrics": self.compute_eval_metrics, From a6dd740a907cf582107478352624257f3db981d6 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 9 May 2026 18:32:56 -0400 Subject: [PATCH 05/26] README: cluster section now framed for NYU specifically Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c1e2e137e..2fc82a481 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ python setup.py build_ext --inplace --force ## Install (HPC cluster) -For clusters where the host glibc is too old or you need a CUDA toolchain that's not pinned by the OS, PufferDrive uses a **mixed Singularity + venv** layout: +For the NYU cluster, PufferDrive recommends a **mixed Singularity + venv** layout: - **Singularity image** (read-only, system-wide): supplies CUDA + cuDNN. - **ext3 overlay** (writable via `--fakeroot`, host the miniforge3 base interpreter at `/ext3/miniforge3` only). From 170183b32749fc466b505699c73fb9cbba158c9a Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 9 May 2026 18:38:09 -0400 Subject: [PATCH 06/26] ruff-format: collapse goal_advance_mode raise to one line Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/ocean/drive/drive.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 894448dca..1d9c4d081 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -241,9 +241,7 @@ def __init__( elif goal_advance_mode == "saturate": self.goal_advance_mode = 1 else: - raise ValueError( - f"goal_advance_mode must be one of 'regenerate' or 'saturate'. Got: {goal_advance_mode}" - ) + raise ValueError(f"goal_advance_mode must be one of 'regenerate' or 'saturate'. Got: {goal_advance_mode}") if self.control_mode_str == "control_vehicles": self.control_mode = 0 From eb21afd77b5e457492b0152d2d8615fecfa9bf11 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 9 May 2026 18:46:22 -0400 Subject: [PATCH 07/26] [WIP] eval: factor shared rollout loop into Evaluator base class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HumanReplay and MultiScenario rollouts had ~80% the same step loop (reset → forward_eval → step → collect infos → aggregate). Move it to the base class as `_run_rollout_loop`. 
Subclasses override only the hooks they actually need to diverge on:

  _initial_reset    (sync vs async reset)
  _maybe_reset_lstm (per-scenario reset cadence)
  _should_stop      (termination condition)
  _flatten_infos    (multi-worker vs single-worker info shape)
  _aggregate_infos  (per-key mean is the default)
  _render_pass      (no-op default; MultiScenario implements EGL)

HumanReplayEvaluator now overrides only `env_overrides` + `_should_stop`.
BehaviorClassEvaluator still inherits HumanReplay; only adds the tmp symlink
dir for random sampling. WOSAC keeps its custom rollout — its per-scene
multi-rollout loop doesn't fit the default shape.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../__pycache__/__init__.cpython-313.pyc        | Bin 0 -> 1183 bytes
 .../__pycache__/base.cpython-313.pyc            | Bin 0 -> 9080 bytes
 .../__pycache__/behavior_class.cpython-313.pyc  | Bin 0 -> 3386 bytes
 .../__pycache__/human_replay.cpython-313.pyc    | Bin 0 -> 2580 bytes
 .../__pycache__/multi_scenario.cpython-313.pyc  | Bin 0 -> 9406 bytes
 .../__pycache__/wosac.cpython-313.pyc           | Bin 0 -> 2709 bytes
 pufferlib/ocean/benchmark/evaluators/base.py    | 115 ++++++++++++---
 .../benchmark/evaluators/behavior_class.py      |  15 +-
 .../benchmark/evaluators/human_replay.py        |  58 ++------
 .../benchmark/evaluators/multi_scenario.py      | 136 ++++++------------
 10 files changed, 156 insertions(+), 168 deletions(-)
 create mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/__init__.cpython-313.pyc
 create mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/base.cpython-313.pyc
 create mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/behavior_class.cpython-313.pyc
 create mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/human_replay.cpython-313.pyc
 create mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/multi_scenario.cpython-313.pyc
 create mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/wosac.cpython-313.pyc

[GIT binary patch blobs for the six committed __pycache__/*.pyc files omitted]

diff --git a/pufferlib/ocean/benchmark/evaluators/base.py b/pufferlib/ocean/benchmark/evaluators/base.py
index 066c1919d..77a7ad95e 100644
--- a/pufferlib/ocean/benchmark/evaluators/base.py
+++ b/pufferlib/ocean/benchmark/evaluators/base.py
@@ -1,7 +1,6 @@
-"""Evaluator base class + EvalResult dataclass."""
+"""Evaluator base class + default rollout loop + EvalResult dataclass."""
 
 from dataclasses import dataclass, field
-from pathlib import Path
 from typing import ClassVar
 
 
@@ -14,9 +13,11 @@ class EvalResult:
 class Evaluator:
     """Base class for all evaluators.
 
-    Subclasses set `type_name` (the value used in `[eval.].type`) and
-    implement `rollout()`. Optionally override `env_overrides()`,
-    `vec_overrides()`, and `aggregate()`.
+    Subclasses typically override only `_should_stop` (the loop termination
+    condition) and `env_overrides`. The default `rollout` runs a step loop
+    suitable for "stream of episode infos until target count reached" evals.
+
+    To diverge from the default loop entirely, override `rollout` directly.
""" type_name: ClassVar[str] = "" @@ -39,6 +40,8 @@ def __init__(self, name: str, config: dict, train_config: dict): self.render_views: list = list(config.get("render_views", ["sim_state"])) self.clean: bool = bool(config.get("clean", True)) + # -- Config hooks --------------------------------------------------- + def env_overrides(self) -> dict: """Per-evaluator [env] overrides. Defaults to whatever the section wrote under `env.*`. Subclasses can override to add baseline knobs.""" @@ -53,24 +56,104 @@ def vec_overrides(self) -> dict: base.update(self.config.get("vec", {})) return base + # -- Rollout (default) ---------------------------------------------- + def rollout(self, vecenv, policy, args) -> EvalResult: - raise NotImplementedError + """Default rollout: reset → step → collect infos → aggregate. - def aggregate(self, per_rollout: list) -> dict: - """Reduce a list of per-rollout dicts to a single metrics dict. + Subclasses tune behavior via the hooks below. Override this + method directly only if the loop shape itself needs to differ + (e.g. per-scene multi-rollout patterns). + """ + metrics = self._run_rollout_loop(vecenv, policy, args) + frames = self._render_pass(vecenv, policy, args) if self.render else [] + return EvalResult(metrics=metrics, frames=frames) - Default: numeric mean over keys present in any sub-dict. WOSAC - overrides for likelihood-style aggregation.""" + def _run_rollout_loop(self, vecenv, policy, args) -> dict: import numpy as np + import torch + + import pufferlib + + device = args["train"]["device"] + num_agents = vecenv.observation_space.shape[0] + state = self._init_lstm_state(num_agents, policy, device, args) + + obs = self._initial_reset(vecenv, args) + + infos_collected: list = [] + steps = 0 + while not self._should_stop(args, infos_collected, steps): + self._maybe_reset_lstm(state, steps, args) + + with torch.no_grad(): + ob_t = torch.as_tensor(obs).to(device) + logits, _ = policy.forward_eval(ob_t, state) + action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) + action = action.cpu().numpy().reshape(vecenv.action_space.shape) + if isinstance(logits, torch.distributions.Normal): + action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) + + obs, _, _, _, infos = vecenv.step(action) + infos_collected.extend(self._flatten_infos(infos)) + steps += 1 + + return self._aggregate_infos(infos_collected) + + # -- Loop hooks (subclass-overridable) ------------------------------ - if not per_rollout: + def _initial_reset(self, vecenv, args): + """Return the initial observation. Default: synchronous reset.""" + obs, _ = vecenv.reset() + return obs + + def _init_lstm_state(self, num_agents, policy, device, args) -> dict: + if not args["train"].get("use_rnn"): return {} - keys = set() - for r in per_rollout: - keys.update(r.keys()) - out = {} + import torch + + return dict( + lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), + lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), + ) + + def _maybe_reset_lstm(self, state, steps, args): + """Hook for resetting LSTM state mid-rollout. Default: no-op.""" + pass + + def _should_stop(self, args, infos_collected, steps) -> bool: + """Loop termination. Subclasses must override.""" + raise NotImplementedError + + def _flatten_infos(self, infos) -> list: + """Pufferlib backends return either a list-of-list (multi-worker) or + a single list (PufferEnv backend). 
Flatten to a list of dicts.""" + out = [] + if not infos: + return out + for sub in infos: + if not sub: + continue + if isinstance(sub, list): + out.extend(sub) + else: + out.append(sub) + return out + + def _aggregate_infos(self, infos: list) -> dict: + """Default: numeric mean per key, plus a num_scenarios_completed count.""" + if not infos: + return {"num_scenarios_completed": 0} + import numpy as np + + out = {"num_scenarios_completed": float(len(infos))} + keys = set().union(*(d.keys() for d in infos)) for k in keys: - vals = [r[k] for r in per_rollout if k in r and isinstance(r[k], (int, float))] + vals = [d[k] for d in infos if isinstance(d.get(k), (int, float))] if vals: out[k] = float(np.mean(vals)) return out + + def _render_pass(self, vecenv, policy, args) -> list: + """Render hook. Subclasses that support frame capture override this.""" + return [] diff --git a/pufferlib/ocean/benchmark/evaluators/behavior_class.py b/pufferlib/ocean/benchmark/evaluators/behavior_class.py index 2d3a7f45c..ef8425fa7 100644 --- a/pufferlib/ocean/benchmark/evaluators/behavior_class.py +++ b/pufferlib/ocean/benchmark/evaluators/behavior_class.py @@ -1,8 +1,7 @@ """BehaviorClassEvaluator — one nuPlan behavior category at a time. -Runs a HumanReplayEvaluator-style rollout against a single map_dir, with -optional fresh random sampling each pass when `num_scenarios` < total bins. -""" +Inherits HumanReplayEvaluator's loop. Adds optional fresh random sampling +each pass when `num_scenarios` < total bins (via a tmp symlink dir).""" import os import random @@ -10,7 +9,6 @@ import tempfile from typing import ClassVar -from pufferlib.ocean.benchmark.evaluators.base import EvalResult from pufferlib.ocean.benchmark.evaluators.human_replay import HumanReplayEvaluator @@ -22,9 +20,6 @@ def __init__(self, name, config, train_config): self._sampled_dir = None # tmp symlink dir created per pass def env_overrides(self) -> dict: - # Reuse HumanReplay's defaults, then handle the random-sampling - # cap. If num_scenarios is smaller than total bins, build a tmp - # symlink dir with a fresh sample each pass and point map_dir there. env = super().env_overrides() map_dir = env.get("map_dir", "") if not map_dir or not os.path.isdir(map_dir): @@ -45,12 +40,6 @@ def env_overrides(self) -> dict: env["num_maps"] = len(all_bins) return env - def rollout(self, vecenv, policy, args) -> EvalResult: - result = super().rollout(vecenv, policy, args) - # Manager owns the cleanup window — defer rmtree until after vecenv.close - # so any open file descriptors on the symlinks are released first. - return result - def cleanup(self): if self._sampled_dir and os.path.isdir(self._sampled_dir): shutil.rmtree(self._sampled_dir, ignore_errors=True) diff --git a/pufferlib/ocean/benchmark/evaluators/human_replay.py b/pufferlib/ocean/benchmark/evaluators/human_replay.py index 28e19e22b..10f06b277 100644 --- a/pufferlib/ocean/benchmark/evaluators/human_replay.py +++ b/pufferlib/ocean/benchmark/evaluators/human_replay.py @@ -1,14 +1,13 @@ """HumanReplayEvaluator — replay mode + control_sdc_only, one rollout per -bin in the map_dir, mean of per-episode info dicts.""" +bin in the map_dir, mean of per-episode info dicts. 
+ +Inherits the default rollout loop from `Evaluator`; only overrides +`_should_stop` to terminate once every bin has produced one info.""" import os from typing import ClassVar -import numpy as np -import torch - -import pufferlib -from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator +from pufferlib.ocean.benchmark.evaluators.base import Evaluator class HumanReplayEvaluator(Evaluator): @@ -32,48 +31,11 @@ def env_overrides(self) -> dict: env["num_maps"] = env["num_agents"] return env - def rollout(self, vecenv, policy, args) -> EvalResult: - device = args["train"]["device"] + def _should_stop(self, args, infos_collected, steps) -> bool: + # Stop once every bin has yielded one info, OR after a step budget + # generous enough to give every bin a chance (env auto-resamples). scenario_length = int(args["env"]["scenario_length"]) init_steps = int(args["env"].get("init_steps", 0)) num_maps = int(args["env"]["num_maps"]) - num_agents = vecenv.observation_space.shape[0] - - # +1 step margin: env emits done on the step after scenario_length. - total_steps = (scenario_length - init_steps + 1) * num_maps - - obs, _ = vecenv.reset() - state = {} - if args["train"]["use_rnn"]: - state = dict( - lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), - lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), - ) - - all_infos = [] - for _ in range(total_steps): - with torch.no_grad(): - ob_t = torch.as_tensor(obs).to(device) - logits, _ = policy.forward_eval(ob_t, state) - action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) - action_np = action.cpu().numpy().reshape(vecenv.action_space.shape) - if isinstance(logits, torch.distributions.Normal): - action_np = np.clip(action_np, vecenv.action_space.low, vecenv.action_space.high) - obs, _, _, _, info_list = vecenv.step(action_np) - if info_list: - all_infos.extend(info_list) - # Stop once every bin has yielded one info to avoid double-counting - # on the second cycle through the dir. - if len(all_infos) >= num_maps: - break - - if not all_infos: - return EvalResult(metrics={"num_scenarios_completed": 0}) - - metrics = {"num_scenarios_completed": float(len(all_infos))} - keys = set().union(*(d.keys() for d in all_infos)) - for k in keys: - vals = [d[k] for d in all_infos if isinstance(d.get(k), (int, float))] - if vals: - metrics[k] = float(np.mean(vals)) - return EvalResult(metrics=metrics, frames=[]) + max_steps = (scenario_length - init_steps + 1) * num_maps + return len(infos_collected) >= num_maps or steps >= max_steps diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py index 365505bfd..0703e6567 100644 --- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py +++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py @@ -1,21 +1,21 @@ """MultiScenarioEvaluator — distribute scenarios across workers, one rollout -per scenario, mean per-scenario metrics.""" +per scenario, mean per-scenario metrics. Drives both the gigaflow validation +path and replay-style multi-scenario evals. 
+ +Inherits the default loop from `Evaluator`; overrides `_should_stop` (cap by +scenario count), `_initial_reset` (async reset for multi-worker throughput), +`_maybe_reset_lstm` (per-scenario LSTM reset), and `_render_pass` (the C-side +EGL → ffmpeg mp4 dump).""" -import contextlib import os -import time from pathlib import Path +from typing import ClassVar -import numpy as np -import torch -import tqdm - -import pufferlib -from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator +from pufferlib.ocean.benchmark.evaluators.base import Evaluator class MultiScenarioEvaluator(Evaluator): - type_name = "multi_scenario" + type_name: ClassVar[str] = "multi_scenario" def vec_overrides(self) -> dict: # Multi-worker by default for throughput. Override via [eval..vec]. @@ -24,8 +24,6 @@ def vec_overrides(self) -> dict: return {"backend": backend, "num_envs": num_envs} def env_overrides(self) -> dict: - # Sensible defaults for the gigaflow path; replay configs are expected - # to set the relevant knobs in [eval..env.*]. env = { "eval_mode": 1, "termination_mode": 0, @@ -34,94 +32,50 @@ def env_overrides(self) -> dict: env.update(self.config.get("env", {})) return env - def rollout(self, vecenv, policy, args) -> EvalResult: - t0 = time.time() - num_scenarios = int(self.config.get("eval", {}).get("num_scenarios", 1)) - scenario_length = int(args["env"].get("scenario_length", 91)) - device = args["train"]["device"] - num_agents = vecenv.observation_space.shape[0] - - global_infos = {} - - # LSTM hidden state shared across the rollout; reset each scenario batch. - state = {} - if args["train"]["use_rnn"]: - state = dict( - lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), - lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), - ) + # -- Loop hooks -- + def _initial_reset(self, vecenv, args): + # Multi-worker async reset gives us the parallel-throughput path. 
vecenv.async_reset(args.get("seed", 42)) - ob, _, _, _, infos, _, _ = vecenv.recv() - scenarios_processed = 0 - with tqdm.tqdm(total=num_scenarios, desc=f"[{self.name}] scenarios", disable=args.get("quiet", False)) as pbar: - while scenarios_processed < num_scenarios: - if args["train"]["use_rnn"]: - state["lstm_h"].zero_() - state["lstm_c"].zero_() - - for _ in range(scenario_length): - with torch.no_grad(): - ob_t = torch.as_tensor(ob).to(device) - logits, _ = policy.forward_eval(ob_t, state) - action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True) - action = action.cpu().numpy().reshape(vecenv.action_space.shape) - if isinstance(logits, torch.distributions.Normal): - action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high) - - ob, _, _, _, infos = vecenv.step(action) - - if infos and infos[0]: - for sub_env in infos: - for env_idx, summary in enumerate(sub_env): - map_name = summary["map_name"].split("/")[-1].split(".")[0] - summary["episode_id"] = env_idx - summary["map_name"] = map_name - scenarios_processed += 1 - pbar.update(1) - for k, v in summary.items(): - global_infos.setdefault(k, []).append(v) - - metrics = self._average(global_infos) - if not args.get("quiet", False): - print(f"[{self.name}] {scenarios_processed} scenarios in {time.time() - t0:.1f}s") - - frames = [] - if self.render: - frames = self._render_pass(vecenv, policy, args) - - return EvalResult(metrics=metrics, frames=frames) - - def _average(self, global_infos: dict) -> dict: - out = {} - import numbers - - for k, vs in global_infos.items(): - if k == "num_scenarios": - out[k] = float(np.sum(vs)) - elif vs and isinstance(vs[0], numbers.Number): - out[k] = float(np.mean(vs)) - return out + ob, _, _, _, _, _, _ = vecenv.recv() + return ob + + def _maybe_reset_lstm(self, state, steps, args): + # Reset between scenarios — gigaflow's auto-resample fires at the + # end of scenario_length, so steps % scenario_length == 0 is the + # natural boundary. No-op when LSTM is unused. + if not state or steps == 0: + return + scenario_length = int(args["env"].get("scenario_length", 0)) + if scenario_length > 0 and steps % scenario_length == 0: + state["lstm_h"].zero_() + state["lstm_c"].zero_() + + def _should_stop(self, args, infos_collected, steps) -> bool: + target = int(self.config.get("eval", {}).get("num_scenarios", 1)) + return len(infos_collected) >= target + + # -- Render -- def _render_pass(self, vecenv, policy, args) -> list: """One rollout per view, all writing mp4s to a single dir. - Re-uses the same vecenv if it's a single-worker setup; otherwise - delegates to a serial render env built fresh per view. + Builds a fresh single-worker env so frame capture is sequential + and starting_map_counter starts at 0 — the C-side ffmpeg-per-env + wiring assumes one bin at a time per process. """ import importlib - env_name = args["env_name"] + import pufferlib + backend = args.get("render_backend", "egl") if backend != "egl": return [] + env_name = args["env_name"] out_dir = Path(args.get("render_results_dir") or args.get("eval_results_dir") or ".") / "mp4" out_dir.mkdir(parents=True, exist_ok=True) - # Render with a fresh single-worker env so frame capture is sequential - # and starting_map_counter starts at 0. Multi-worker render doesn't - # match the C-side ffmpeg-per-env wiring cleanly. 
package = args.get("package", "ocean") module_name = "pufferlib.ocean" if package == "ocean" else f"pufferlib.environments.{package}" env_module = importlib.import_module(module_name) @@ -155,6 +109,11 @@ def _render_pass(self, vecenv, policy, args) -> list: return all_paths def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: Path) -> list: + import numpy as np + import torch + + import pufferlib + device = args["train"]["device"] num_agents = vecenv.observation_space.shape[0] num_scenarios = int(self.config.get("eval", {}).get("num_scenarios", 1)) @@ -163,12 +122,7 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: saved_cwd = os.getcwd() os.chdir(out_dir) try: - state = {} - if args["train"]["use_rnn"]: - state = dict( - lstm_h=torch.zeros(num_agents, policy.hidden_size, device=device), - lstm_c=torch.zeros(num_agents, policy.hidden_size, device=device), - ) + state = self._init_lstm_state(num_agents, policy, device, args) scenarios_processed = 0 while scenarios_processed < num_scenarios: ob, _ = vecenv.reset() @@ -176,7 +130,7 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: num_in_batch = len(scenarios) remaining = num_scenarios - scenarios_processed - num_in_batch target_env.batch_size_eval = max(1, remaining) - if args["train"]["use_rnn"]: + if state: state["lstm_h"].zero_() state["lstm_c"].zero_() for _ in range(max_steps): From aeb9dd565a2ff7ca1d5e38fa1602fc68537d0f48 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 9 May 2026 18:47:31 -0400 Subject: [PATCH 08/26] [WIP] eval: untrack accidentally-committed pycache; harden gitignore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The benchmark/** unignore was over-broad — it pulled in pycache files that should never be committed. Add an explicit re-ignore for __pycache__ under that path. 
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .gitignore                                        |   2 ++
 .../__pycache__/__init__.cpython-313.pyc          | Bin 1183 -> 0 bytes
 .../evaluators/__pycache__/base.cpython-313.pyc   | Bin 9080 -> 0 bytes
 .../__pycache__/behavior_class.cpython-313.pyc    | Bin 3386 -> 0 bytes
 .../__pycache__/human_replay.cpython-313.pyc      | Bin 2580 -> 0 bytes
 .../__pycache__/multi_scenario.cpython-313.pyc    | Bin 9406 -> 0 bytes
 .../evaluators/__pycache__/wosac.cpython-313.pyc  | Bin 2709 -> 0 bytes
 7 files changed, 2 insertions(+)
 delete mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/__init__.cpython-313.pyc
 delete mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/base.cpython-313.pyc
 delete mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/behavior_class.cpython-313.pyc
 delete mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/human_replay.cpython-313.pyc
 delete mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/multi_scenario.cpython-313.pyc
 delete mode 100644 pufferlib/ocean/benchmark/evaluators/__pycache__/wosac.cpython-313.pyc

diff --git a/.gitignore b/.gitignore
index 782cfdf36..430b374ec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -150,6 +150,8 @@ experiments/
 benchmark*/
 !pufferlib/ocean/benchmark/
 !pufferlib/ocean/benchmark/**
+# But re-ignore caches inside it
+pufferlib/ocean/benchmark/**/__pycache__/
 wandb/
 .neptune/
 raylib*/

[GIT binary patch blobs deleting the six __pycache__/*.pyc files omitted]

From 3f28b08e02ecbaa0154af9cb86befc86cf89bead Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sat, 9 May 2026 18:51:45 -0400
Subject: [PATCH 09/26] tests: add 3-level inheritance + self-cycle cases for
 EvalManager parser

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 tests/test_eval_manager.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/test_eval_manager.py b/tests/test_eval_manager.py
index 064463cc7..0b2356a41 100644
--- a/tests/test_eval_manager.py
+++ b/tests/test_eval_manager.py
@@ -47,6 +47,26 @@ def test_inheritance_chain():
     assert cfg["env"]["map_dir"] == "/tmp/hard_stop"
 
 
+def test_inheritance_three_levels():
+    # C inherits B inherits A. Each level overrides the one above.
+ sections = { + "A": {"interval": 100, "env.scenario_length": 91, "env.map_dir": "/A"}, + "B": {"inherits": "A", "interval": 200, "env.scenario_length": 201}, + "C": {"inherits": "B", "env.map_dir": "/C", "render": True}, + } + cfg = _build_section_config("C", sections["C"], sections) + assert cfg["interval"] == 200, "B should win over A on interval" + assert cfg["env"]["scenario_length"] == 201, "B should win over A on scenario_length" + assert cfg["env"]["map_dir"] == "/C", "C should win over A and B on map_dir" + assert cfg["render"] is True, "C's own field" + + +def test_inheritance_self_cycle_detected(): + sections = {"a": {"inherits": "a"}} + with pytest.raises(ValueError, match="Cyclic"): + _build_section_config("a", sections["a"], sections) + + def test_inheritance_child_wins(): sections = { "parent": {"interval": 250, "env.scenario_length": 201}, From 8b3519458d6053b4663c707e91b385539a6c0f99 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 9 May 2026 23:37:33 -0400 Subject: [PATCH 10/26] [WIP] eval: render budget knob + random scenario selection per epoch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - eval.render_num_scenarios: explicit per-evaluator render budget; defaults to min(eval.num_scenarios, 3). Renders are expensive (mp4 encode + wandb upload) and shouldn't run at metric scale. - Render path randomizes starting_map per epoch when not pinned, so successive renders show different bins from the dir instead of the first N alphabetically. Restores the old behavior from _render_driving_behaviours that the refactor lost. - behaviors_defaults pins render_num_scenarios = 2 so 12 classes × 2 views × 2 scenarios = 48 mp4s/epoch (vs 1200 if it inherited num_scenarios=50). - Test: render_num_scenarios is inheritable. "Worst N" selection deferred — needs per-scenario scores piped out of the metric rollout into the render call. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/config/ocean/drive.ini | 4 +++ .../benchmark/evaluators/multi_scenario.py | 30 ++++++++++++++++--- tests/test_eval_manager.py | 20 +++++++++++++ 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index d6d62a025..ebe977e02 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -258,6 +258,10 @@ env.init_mode = "create_all_valid" env.scenario_length = 201 env.max_partner_observations = 32 eval.num_scenarios = 50 +; Render budget per epoch (metrics still use the full num_scenarios). +; Defaults to min(num_scenarios, 3); pin lower if even 3 mp4s × 12 classes +; × 2 views is too much wandb traffic. +eval.render_num_scenarios = 2 [eval.behaviors_full_dir] inherits = "behaviors_defaults" diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py index 0703e6567..bd6c5672a 100644 --- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py +++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py @@ -60,11 +60,19 @@ def _should_stop(self, args, infos_collected, steps) -> bool: def _render_pass(self, vecenv, policy, args) -> list: """One rollout per view, all writing mp4s to a single dir. - Builds a fresh single-worker env so frame capture is sequential - and starting_map_counter starts at 0 — the C-side ffmpeg-per-env - wiring assumes one bin at a time per process. + Builds a fresh single-worker env per view (C-side ffmpeg-per-env + wiring assumes one bin at a time per process). 
Render budget and + starting position are independent of the metric pass: + + eval.render_num_scenarios — how many scenarios to render. Defaults + to min(eval.num_scenarios, 3). Always respected over + num_scenarios so renders stay cheap. + starting_map — randomized per render epoch so successive epochs + show different scenarios from the dir, not the same first-N + alphabetically. Set explicitly in env.* to pin. """ import importlib + import random import pufferlib @@ -84,6 +92,15 @@ def _render_pass(self, vecenv, policy, args) -> list: render_env_kwargs = dict(args["env"]) render_env_kwargs["render_mode"] = "headless" + # Random starting map per render epoch — every epoch shows a + # different bin from the directory rather than the first N + # alphabetically. The user can pin by setting env.starting_map + # explicitly in the [eval.] section. + if "starting_map" not in self.config.get("env", {}): + num_maps = int(render_env_kwargs.get("num_maps", 1)) + if num_maps > 1: + render_env_kwargs["starting_map"] = random.randint(0, num_maps - 1) + all_paths = [] for view in self.render_views: view_idx = _VIEW_NAME_TO_IDX.get(view, 0) @@ -116,7 +133,12 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: device = args["train"]["device"] num_agents = vecenv.observation_space.shape[0] - num_scenarios = int(self.config.get("eval", {}).get("num_scenarios", 1)) + # Render budget defaults to min(num_scenarios, 3) if not set explicitly. + # Renders are expensive (mp4 encode + wandb upload) so we don't want + # them at metric-pass scale. + eval_cfg = self.config.get("eval", {}) + metric_count = int(eval_cfg.get("num_scenarios", 1)) + num_scenarios = int(eval_cfg.get("render_num_scenarios", min(metric_count, 3))) max_steps = args.get("render_max_steps") or int(args["env"].get("scenario_length", 91)) saved_cwd = os.getcwd() diff --git a/tests/test_eval_manager.py b/tests/test_eval_manager.py index 0b2356a41..b4ab620b7 100644 --- a/tests/test_eval_manager.py +++ b/tests/test_eval_manager.py @@ -136,6 +136,26 @@ def test_manager_from_config_skips_template_sections(): assert "behaviors_defaults" not in names # template, no `type` field +def test_render_num_scenarios_inheritable(): + # Behavior-style template specifies a small render budget; the per-class + # section inherits it without re-declaring. + sections = { + "defaults": { + "type": "behavior_class", + "interval": 250, + "eval.num_scenarios": 50, + "eval.render_num_scenarios": 2, + }, + "hard_stop": { + "inherits": "defaults", + "env.map_dir": "/tmp/hard_stop", + }, + } + cfg = _build_section_config("hard_stop", sections["hard_stop"], sections) + assert cfg["eval"]["num_scenarios"] == 50 + assert cfg["eval"]["render_num_scenarios"] == 2 + + def test_manager_unknown_type_raises(): train_config = {"eval": {"foo": {"type": "totally_made_up"}}} with pytest.raises(ValueError, match="not registered"): From 1537f566330a703133a00be92f9cca1da8768898 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 9 May 2026 23:47:01 -0400 Subject: [PATCH 11/26] [WIP] eval: subprocess evals see fresh checkpoint, not stale resume path Two coupled fixes: EvalManager: - Accepts run_id at construction. Used to locate the per-run models/ directory. - latest_checkpoint(env_name) walks data_dir/_/models/ for the newest model_*.pt. Falls back to train_config.load_model_path if no checkpoints exist yet. - has_subprocess_evals_at(epoch) reports whether any enabled subprocess evaluator would fire at that epoch. 
- _run_subprocess uses latest_checkpoint instead of train_config.load_model_path. PuffeRL.evaluate: - Calls save_checkpoint() before maybe_run() if any subprocess evaluator would fire. Mirrors the old run_driving_behaviours _eval_in_subprocess flow. - train() passes logger.run_id when constructing the manager. For all-inline configs (today's drive.ini default) this is a no-op. Activates when an evaluator is flipped to mode=subprocess. Tests: latest_checkpoint picks the newest by ctime, falls back to load_model_path when no models exist; has_subprocess_evals_at fires only on enabled subprocess evaluators at matching intervals. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/ocean/benchmark/manager.py | 43 ++++++++++++++++++++++---- pufferlib/pufferl.py | 7 ++++- tests/test_eval_manager.py | 46 ++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 7 deletions(-) diff --git a/pufferlib/ocean/benchmark/manager.py b/pufferlib/ocean/benchmark/manager.py index 86dedaacc..9e79a50bc 100644 --- a/pufferlib/ocean/benchmark/manager.py +++ b/pufferlib/ocean/benchmark/manager.py @@ -20,6 +20,7 @@ """ import copy +import glob import importlib import json import os @@ -45,12 +46,15 @@ class EvalManager: - def __init__(self, evaluators: list, train_config: dict): + def __init__(self, evaluators: list, train_config: dict, run_id: str = None): self.evaluators = evaluators self.train_config = train_config + # `run_id` is needed to resolve the latest checkpoint for subprocess + # evals. None is fine if no evaluator is mode=subprocess. + self.run_id = run_id @classmethod - def from_config(cls, train_config: dict) -> "EvalManager": + def from_config(cls, train_config: dict, run_id: str = None) -> "EvalManager": sections = _discover_eval_sections(train_config) evaluators = [] for name, raw in sections.items(): @@ -66,7 +70,31 @@ def from_config(cls, train_config: dict) -> "EvalManager": f"Known types: {sorted(EVALUATOR_REGISTRY.keys())}" ) evaluators.append(cls_for_type(name=name, config=cfg, train_config=train_config)) - return cls(evaluators=evaluators, train_config=train_config) + return cls(evaluators=evaluators, train_config=train_config, run_id=run_id) + + def has_subprocess_evals_at(self, epoch: int) -> bool: + """True if any enabled subprocess evaluator would fire at this epoch. + Training loop uses this to decide whether to save_checkpoint() before + calling maybe_run() — subprocesses load the checkpoint from disk.""" + for ev in self.evaluators: + if not ev.enabled or ev.mode != "subprocess" or ev.interval <= 0: + continue + if epoch % ev.interval == 0: + return True + return False + + def latest_checkpoint(self, env_name: str) -> str: + """Return the path to the most recent model_*.pt under the experiment + dir. Falls back to train_config['load_model_path'] if no checkpoints + have been written yet (e.g. resume-from path before first save). + Returns None if neither resolves.""" + if self.run_id and self.train_config.get("data_dir"): + model_dir = os.path.join(self.train_config["data_dir"], f"{env_name}_{self.run_id}", "models") + if os.path.isdir(model_dir): + files = glob.glob(os.path.join(model_dir, "model_*.pt")) + if files: + return max(files, key=os.path.getctime) + return self.train_config.get("load_model_path") def maybe_run(self, epoch: int, policy, env_name: str, logger=None, global_step=None) -> dict: """Called from the training loop. 
Runs every enabled evaluator
@@ -147,9 +175,12 @@ def _run_subprocess(self, ev: Evaluator, env_name: str, global_step) -> EvalResu
             "--out", str(out_path),
         ]
-        # Subprocess inherits the same checkpoint via train_config.load_model_path.
-        if self.train_config.get("load_model_path"):
-            cmd += ["--load-model-path", self.train_config["load_model_path"]]
+        # Subprocess loads the freshest checkpoint on disk. Caller (training
+        # loop) is responsible for save_checkpoint() before this fires —
+        # see has_subprocess_evals_at.
+        ckpt = self.latest_checkpoint(env_name)
+        if ckpt:
+            cmd += ["--load-model-path", ckpt]
         subprocess.run(cmd, check=True)
         with open(out_path) as f:
             payload = json.load(f)
diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py
index 7176c9759..e6b275d22 100644
--- a/pufferlib/pufferl.py
+++ b/pufferlib/pufferl.py
@@ -462,6 +462,11 @@ def train(self):
         # the manager fires any whose interval divides this epoch. See
         # docs/eval_unification.md for the design.
         if self._eval_manager is not None:
+            # Subprocess evals load the policy from disk. Save the latest
+            # checkpoint first so they see this epoch's weights, not the
+            # last save_checkpoint() from `checkpoint_interval`.
+            if self._eval_manager.has_subprocess_evals_at(self.epoch):
+                self.save_checkpoint()
             self._eval_manager.maybe_run(
                 epoch=self.epoch,
                 policy=self.uncompiled_policy,
@@ -1376,7 +1381,7 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None, early_stop

     from pufferlib.ocean.benchmark.manager import EvalManager

-    pufferl._eval_manager = EvalManager.from_config(args)
+    pufferl._eval_manager = EvalManager.from_config(args, run_id=logger.run_id if logger else None)

     # Restore optimizer state + step counters when resuming from a checkpoint.
     # save_checkpoint writes models/model_<env_name>_<epoch>.pt and trainer_state.pt
diff --git a/tests/test_eval_manager.py b/tests/test_eval_manager.py
index b4ab620b7..f009901df 100644
--- a/tests/test_eval_manager.py
+++ b/tests/test_eval_manager.py
@@ -160,3 +160,49 @@ def test_manager_unknown_type_raises():
     train_config = {"eval": {"foo": {"type": "totally_made_up"}}}
     with pytest.raises(ValueError, match="not registered"):
         EvalManager.from_config(train_config)
+
+
+def test_has_subprocess_evals_at():
+    train_config = {
+        "eval": {
+            "inline_one": {"type": "human_replay", "interval": 25, "mode": "inline"},
+            "subprocess_one": {"type": "human_replay", "interval": 100, "mode": "subprocess"},
+            "subprocess_disabled": {
+                "type": "human_replay",
+                "interval": 100,
+                "mode": "subprocess",
+                "enabled": False,
+            },
+        }
+    }
+    mgr = EvalManager.from_config(train_config)
+    assert mgr.has_subprocess_evals_at(epoch=100) is True  # subprocess_one fires
+    assert mgr.has_subprocess_evals_at(epoch=25) is False  # only inline at 25
+    assert mgr.has_subprocess_evals_at(epoch=50) is False  # nothing at 50
+
+
+def test_latest_checkpoint_finds_newest_pt(tmp_path):
+    import time
+
+    model_dir = tmp_path / "puffer_drive_run123" / "models"
+    model_dir.mkdir(parents=True)
+    p_old = model_dir / "model_puffer_drive_001.pt"
+    p_old.write_text("a")
+    time.sleep(0.05)
+    p_new = model_dir / "model_puffer_drive_002.pt"
+    p_new.write_text("b")
+
+    train_config = {"data_dir": str(tmp_path), "eval": {}}
+    mgr = EvalManager.from_config(train_config, run_id="run123")
+    assert mgr.latest_checkpoint("puffer_drive") == str(p_new)
+
+
+def test_latest_checkpoint_falls_back_to_load_model_path(tmp_path):
+    train_config = {
+        "data_dir": str(tmp_path),
+        "load_model_path": "/some/resume/path.pt",
+        "eval": {},
+    }
+    mgr = 
EvalManager.from_config(train_config, run_id="run123") + # No models dir exists → falls back to load_model_path + assert mgr.latest_checkpoint("puffer_drive") == "/some/resume/path.pt" From 7af739a04960c2a87f422b2333077aeaac136766 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 00:06:26 -0400 Subject: [PATCH 12/26] =?UTF-8?q?[WIP]=20eval:=20revert=20goal=5Fadvance?= =?UTF-8?q?=5Fmode=20C=20knob=20=E2=80=94=20defer=20to=20a=20separate=20PR?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The unified-eval refactor doesn't actually need this knob, and pushing it through the C struct + binding + drive.py at the same time bloats this PR. Restoring the original `if (env->simulation_mode == SIMULATION_REPLAY)` branch in c_step. The "promote implicit branches to explicit knobs" audit is real work but lives in its own PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/ocean/drive/binding.c | 1 - pufferlib/ocean/drive/drive.h | 20 ++++---------------- pufferlib/ocean/drive/drive.py | 15 --------------- 3 files changed, 4 insertions(+), 32 deletions(-) diff --git a/pufferlib/ocean/drive/binding.c b/pufferlib/ocean/drive/binding.c index 0dd93a5e5..b2a4c20b1 100644 --- a/pufferlib/ocean/drive/binding.c +++ b/pufferlib/ocean/drive/binding.c @@ -1788,7 +1788,6 @@ static int my_init(Env *env, PyObject *args, PyObject *kwargs) { env->init_mode = (int)unpack(kwargs, "init_mode"); env->control_mode = (int)unpack(kwargs, "control_mode"); env->simulation_mode = (int)unpack(kwargs, "simulation_mode"); - env->goal_advance_mode = (int)unpack(kwargs, "goal_advance_mode"); env->reward_conditioning = (bool)unpack(kwargs, "reward_conditioning"); env->reward_randomization = (bool)unpack(kwargs, "reward_randomization"); env->compute_eval_metrics = (bool)unpack(kwargs, "compute_eval_metrics"); diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index 701eb1b9f..17ee20987 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -64,16 +64,6 @@ #define SIMULATION_GIGAFLOW 0 #define SIMULATION_REPLAY 1 -// Goal advance modes — chosen when the SDC reaches the last goal in its -// sequence. REGENERATE recomputes a fresh set along the route (the -// gigaflow training pattern). SATURATE leaves the goal queue at its -// final state so the reached-goal condition won't fire again (the -// replay-mode pattern, where regenerating would dereference NULL paths -// for nuPlan bins without route info). Defaults to REGENERATE for -// gigaflow and SATURATE for replay; the Python config layer chooses. -#define GOAL_ADVANCE_REGENERATE 0 -#define GOAL_ADVANCE_SATURATE 1 - // Lane selection scoring #define LANE_SELECTION_DISTANCE_WEIGHT 0.7f #define LANE_SELECTION_HEADING_WEIGHT 0.3f @@ -346,7 +336,6 @@ struct Drive { int init_mode; int control_mode; int simulation_mode; - int goal_advance_mode; int termination_mode; float inactive_agent_threshold; int reward_conditioning; @@ -4877,11 +4866,10 @@ void c_step(Drive *env) { if (agent->current_goal_idx == env->num_target_waypoints) { // Last goal reached env->logs[i].num_goals_reached += 1; - if (env->goal_advance_mode == GOAL_ADVANCE_SATURATE) { - // Leave current_goal_idx saturated so the reached-goal - // condition won't fire again. Used by replay evals where - // regenerating route-based goals on WOMD/nuPlan bins - // would fail (path NULL or removed=1). 
+ if (env->simulation_mode == SIMULATION_REPLAY) { + // Replay mode: leave current_goal_idx saturated so the + // reached-goal condition won't fire again. Re-generating + // route-based goals on WOMD maps fails (removed=1). } else { compute_goals(env, agent_idx); } diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 1d9c4d081..7dc537ca5 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -57,7 +57,6 @@ def __init__( action_type="discrete", dynamics_model="classic", simulation_mode="gigaflow", - goal_advance_mode=None, termination_mode=0, inactive_agent_threshold=0.4, buf=None, @@ -230,19 +229,6 @@ def __init__( else: raise ValueError(f"simulation_mode must be one of 'gigaflow' or 'replay'. Got: {self.simulation_mode_str}") - # goal_advance_mode controls what happens when the SDC reaches the - # last goal in its sequence. None → auto-pick based on simulation_mode - # (gigaflow=regenerate, replay=saturate). Explicit values: "regenerate" - # or "saturate". - if goal_advance_mode is None: - self.goal_advance_mode = 1 if self.simulation_mode == 1 else 0 - elif goal_advance_mode == "regenerate": - self.goal_advance_mode = 0 - elif goal_advance_mode == "saturate": - self.goal_advance_mode = 1 - else: - raise ValueError(f"goal_advance_mode must be one of 'regenerate' or 'saturate'. Got: {goal_advance_mode}") - if self.control_mode_str == "control_vehicles": self.control_mode = 0 elif self.control_mode_str == "control_agents": @@ -401,7 +387,6 @@ def _env_init_kwargs(self, map_file, max_agents): "init_mode": self.init_mode, "control_mode": self.control_mode, "simulation_mode": self.simulation_mode, - "goal_advance_mode": self.goal_advance_mode, "reward_conditioning": self.reward_conditioning, "reward_randomization": self.reward_randomization, "compute_eval_metrics": self.compute_eval_metrics, From 1153abefcc043c4e074c7ca83312fb4c05f6f4ba Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 00:08:00 -0400 Subject: [PATCH 13/26] =?UTF-8?q?[WIP]=20eval:=20tier-A=20tests=20?= =?UTF-8?q?=E2=80=94=20dispatch,=20info-shape,=20behavior=20cleanup,=20ove?= =?UTF-8?q?rride=20stack?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds 4 tests covering the regression-prone surface the parser-only tests miss: test_maybe_run_dispatches_by_interval_and_enabled Stubs _run_one and verifies that maybe_run fires only enabled evaluators whose interval divides epoch. epoch=33 fires nothing, epoch=250 fires both 25-interval and 250-interval evaluators. test_flatten_infos_handles_shape_variations _flatten_infos must handle multi-worker (list of lists) AND PufferEnv (flat list) backends, plus None / empty entries — one bad isinstance check silently drops episode infos. test_behavior_class_cleanup_removes_symlink_dir Builds a real 5-bin map_dir, requests a 2-bin sample, verifies the tmp symlink dir gets created with 2 bins, then cleanup() removes it. tempfile.mkdtemp leftovers are a real footgun. test_eval_args_compose_train_section_and_clean_macro The full override stack: train baseline → section overrides → clean macro. Section beats baseline, explicit beats macro, untouched baseline survives. 25 tests total (was 21). 
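
For reference, the shape contract that test pins down, sketched (this is
not the base.py implementation; the name and control flow here are
illustrative only):

    def _flatten_infos(infos):
        # None / [] / [None] / [[], []] all flatten to [];
        # [[d1], [d2]] (multi-worker) and [d1, d2] (PufferEnv) → [d1, d2].
        flat = []
        for item in infos or []:
            if isinstance(item, dict):
                flat.append(item)
            elif item:
                flat.extend(d for d in item if d)
        return flat
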
Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_eval_manager.py | 128 ++++++++++++++++++++++++++++++++++++- 1 file changed, 125 insertions(+), 3 deletions(-) diff --git a/tests/test_eval_manager.py b/tests/test_eval_manager.py index f009901df..e9df7daa8 100644 --- a/tests/test_eval_manager.py +++ b/tests/test_eval_manager.py @@ -1,8 +1,9 @@ -"""Smoke tests for EvalManager config parsing. +"""Smoke tests for EvalManager config parsing + dispatch. Doesn't load the full pufferl.py module (which pulls heavy training deps). -Just verifies the inheritance + clean macro + dotted-key expansion logic -behaves as the design doc says. +Verifies parser correctness, dispatch gating, info-flattening shape +handling, behavior-class symlink cleanup, and the train/section/macro +override resolution stack. """ import os @@ -12,6 +13,8 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from pufferlib.ocean.benchmark.evaluators import EvalResult, Evaluator +from pufferlib.ocean.benchmark.evaluators.behavior_class import BehaviorClassEvaluator from pufferlib.ocean.benchmark.manager import ( CLEAN_EVAL_OVERRIDES, EvalManager, @@ -206,3 +209,122 @@ def test_latest_checkpoint_falls_back_to_load_model_path(tmp_path): mgr = EvalManager.from_config(train_config, run_id="run123") # No models dir exists → falls back to load_model_path assert mgr.latest_checkpoint("puffer_drive") == "/some/resume/path.pt" + + +# -- Tier A: dispatch + invariants ----------------------------------------- + + +def test_maybe_run_dispatches_by_interval_and_enabled(monkeypatch): + """maybe_run should fire only enabled evaluators whose interval divides epoch.""" + train_config = { + "eval": { + "fires_at_25": {"type": "human_replay", "interval": 25}, + "fires_at_250": {"type": "human_replay", "interval": 250}, + "disabled": {"type": "human_replay", "interval": 25, "enabled": False}, + "zero_interval": {"type": "human_replay", "interval": 0}, + } + } + mgr = EvalManager.from_config(train_config) + + calls = [] + + def fake_run(ev, *, policy, env_name, logger, global_step): + calls.append(ev.name) + return EvalResult(metrics={}) + + monkeypatch.setattr(mgr, "_run_one", fake_run) + + mgr.maybe_run(epoch=25, policy=None, env_name="puffer_drive") + assert calls == ["fires_at_25"], "only the 25-interval evaluator fires at epoch 25" + calls.clear() + + mgr.maybe_run(epoch=250, policy=None, env_name="puffer_drive") + assert sorted(calls) == ["fires_at_25", "fires_at_250"], "both fire at epoch 250" + calls.clear() + + mgr.maybe_run(epoch=50, policy=None, env_name="puffer_drive") + assert calls == ["fires_at_25"], "only fires_at_25 at epoch 50; nothing else" + calls.clear() + + mgr.maybe_run(epoch=33, policy=None, env_name="puffer_drive") + assert calls == [], "nothing fires when no interval divides the epoch" + + +def test_flatten_infos_handles_shape_variations(): + """_flatten_infos must accept both list-of-list (multi-worker) and + flat-list (PufferEnv) info shapes, plus None / empty entries.""" + + class _Stub(Evaluator): + type_name = "_stub_flatten" + + def _should_stop(self, *args, **kwargs): + return True + + s = _Stub("test", {}, {}) + assert s._flatten_infos(None) == [] + assert s._flatten_infos([]) == [] + assert s._flatten_infos([None, None]) == [] + assert s._flatten_infos([[], []]) == [] + + d1, d2, d3 = {"a": 1}, {"b": 2}, {"c": 3} + # Multi-worker backend: list of per-worker info lists + assert s._flatten_infos([[d1], [d2]]) == [d1, d2] + assert s._flatten_infos([[d1, d2], [d3]]) == [d1, d2, 
d3] + # PufferEnv backend: flat list of info dicts + assert s._flatten_infos([d1, d2]) == [d1, d2] + + +def test_behavior_class_cleanup_removes_symlink_dir(tmp_path): + """BehaviorClassEvaluator builds a tmp symlink dir when sampling. + cleanup() must remove it; otherwise we accumulate leftovers.""" + map_dir = tmp_path / "bins" + map_dir.mkdir() + for i in range(5): + (map_dir / f"map_{i}.bin").write_text("a") + + config = { + "type": "behavior_class", + "env": {"map_dir": str(map_dir)}, + "eval": {"num_scenarios": 2}, + } + ev = BehaviorClassEvaluator("test_class", config, train_config={}) + + overrides = ev.env_overrides() + sampled = overrides["map_dir"] + assert sampled != str(map_dir), "sampling should redirect to a tmp dir" + assert os.path.isdir(sampled) + assert len([f for f in os.listdir(sampled) if f.endswith(".bin")]) == 2 + + ev.cleanup() + assert not os.path.exists(sampled), "tmp dir should be gone after cleanup" + assert ev._sampled_dir is None + + +def test_eval_args_compose_train_section_and_clean_macro(): + """_build_eval_args must fold train_config['env'] (baseline) + + section overrides + clean macro correctly. Section beats baseline, + explicit beats clean macro, baseline survives when not overridden.""" + train_config = { + "env": { + "lane_segment_dropout": 0.5, # training perturbation + "scenario_length": 91, + "num_agents": 1024, # only present in train baseline + }, + "train": {"seed": 42, "device": "cpu"}, + "eval": { + "validation": { + "type": "multi_scenario", + "interval": 25, + "env.scenario_length": 201, # section overrides baseline + # clean=true (default) → lane_segment_dropout zeroed by macro + # num_agents not specified → falls through to train baseline + }, + }, + } + mgr = EvalManager.from_config(train_config) + ev = mgr.evaluators[0] + args = mgr._build_eval_args(ev, env_name="puffer_drive", global_step=0) + + assert args["env"]["scenario_length"] == 201, "section override wins" + assert args["env"]["lane_segment_dropout"] == 0.0, "clean macro applied" + assert args["env"]["num_agents"] == 1024, "train baseline preserved" From 1cfbcc3658eacebaabe1bbb00d3d6471ae4ba662 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 00:15:45 -0400 Subject: [PATCH 14/26] =?UTF-8?q?drive.ini:=20bump=20validation=5Fgigaflow?= =?UTF-8?q?=20interval=2025=20=E2=86=92=20250?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aligns with the heavier-eval cadence the rest of the new eval sections use. 25-epoch interval was too aggressive for inline gigaflow validation given the per-pass setup cost. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/config/ocean/drive.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index ebe977e02..b49cc1de8 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -216,7 +216,7 @@ render_map = none [eval.validation_gigaflow] type = "multi_scenario" enabled = true -interval = 25 +interval = 250 mode = "inline" clean = true render = false From c2fa176400df91e9dc2fac5adf33e3975c85b88f Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 00:29:38 -0400 Subject: [PATCH 15/26] [WIP] eval: log eval_seconds per evaluator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Time the rollout in Evaluator.rollout (base class) and inject `eval_seconds` into the returned metrics dict. 
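
What the split buys a subclass, sketched with a made-up evaluator
(LatencyProbe is illustrative, not part of this PR):

    class LatencyProbe(Evaluator):
        type_name = "latency_probe"

        def _run_rollout_loop(self, vecenv, policy, args):
            return {"probe_metric": 1.0}  # timing is added by rollout()

    # .rollout(...) returns an EvalResult whose metrics now read
    # {"probe_metric": 1.0, "eval_seconds": <wall-clock of the loop>}
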
Manager's _log posts it to wandb under {ev.name}/eval_seconds — wall-clock cost per evaluator becomes a first-class panel. Refactor WOSACEvaluator to override _run_rollout_loop instead of rollout — now WOSAC also benefits from the timing without code duplication. Test: stub evaluator with a forced 20ms floor; verifies eval_seconds lands in the result and the inner metrics survive. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/ocean/benchmark/evaluators/base.py | 11 ++++++++--- pufferlib/ocean/benchmark/evaluators/wosac.py | 7 +++---- tests/test_eval_manager.py | 19 +++++++++++++++++++ 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/pufferlib/ocean/benchmark/evaluators/base.py b/pufferlib/ocean/benchmark/evaluators/base.py index 77a7ad95e..08406b5cd 100644 --- a/pufferlib/ocean/benchmark/evaluators/base.py +++ b/pufferlib/ocean/benchmark/evaluators/base.py @@ -1,5 +1,6 @@ """Evaluator base class + default rollout loop + EvalResult dataclass.""" +import time from dataclasses import dataclass, field from typing import ClassVar @@ -61,12 +62,16 @@ def vec_overrides(self) -> dict: def rollout(self, vecenv, policy, args) -> EvalResult: """Default rollout: reset → step → collect infos → aggregate. - Subclasses tune behavior via the hooks below. Override this - method directly only if the loop shape itself needs to differ - (e.g. per-scene multi-rollout patterns). + Times the inner work and adds `eval_seconds` to metrics so wandb + panels show wall-clock cost per evaluator. Subclasses tune + behavior by overriding `_run_rollout_loop` (and optionally + `_render_pass`); only override this method if the loop shape + itself needs to differ. """ + t0 = time.time() metrics = self._run_rollout_loop(vecenv, policy, args) frames = self._render_pass(vecenv, policy, args) if self.render else [] + metrics["eval_seconds"] = float(time.time() - t0) return EvalResult(metrics=metrics, frames=frames) def _run_rollout_loop(self, vecenv, policy, args) -> dict: diff --git a/pufferlib/ocean/benchmark/evaluators/wosac.py b/pufferlib/ocean/benchmark/evaluators/wosac.py index 8733c8a2a..b8a7d82af 100644 --- a/pufferlib/ocean/benchmark/evaluators/wosac.py +++ b/pufferlib/ocean/benchmark/evaluators/wosac.py @@ -8,7 +8,7 @@ from typing import ClassVar -from pufferlib.ocean.benchmark.evaluators.base import EvalResult, Evaluator +from pufferlib.ocean.benchmark.evaluators.base import Evaluator class WOSACEvaluator(Evaluator): @@ -25,7 +25,7 @@ def env_overrides(self) -> dict: env.update(self.config.get("env", {})) return env - def rollout(self, vecenv, policy, args) -> EvalResult: + def _run_rollout_loop(self, vecenv, policy, args) -> dict: # Inner class pulls pandas/matplotlib — keep the import inside the # rollout so the wrapper class can be imported in environments # that don't have those (e.g. unit-test smoke envs). 
@@ -38,5 +38,4 @@ def rollout(self, vecenv, policy, args) -> EvalResult: results["total_num_agents"] = float(df["num_agents_per_scene"].sum()) results["total_unique_scenarios"] = float(df.index.unique().shape[0]) results["realism_meta_score_std"] = float(df["realism_meta_score"].std()) - results = {k: (float(v) if hasattr(v, "item") else v) for k, v in results.items()} - return EvalResult(metrics=results, frames=[]) + return {k: (float(v) if hasattr(v, "item") else v) for k, v in results.items()} diff --git a/tests/test_eval_manager.py b/tests/test_eval_manager.py index e9df7daa8..33a1fb1e1 100644 --- a/tests/test_eval_manager.py +++ b/tests/test_eval_manager.py @@ -300,6 +300,25 @@ def test_behavior_class_cleanup_removes_symlink_dir(tmp_path): assert ev._sampled_dir is None +def test_rollout_records_eval_seconds(): + """Every rollout's metrics dict should include `eval_seconds` so wandb + panels show wall-clock cost per evaluator.""" + import time as _time + + class _Stub(Evaluator): + type_name = "_stub_timing" + + def _run_rollout_loop(self, vecenv, policy, args): + _time.sleep(0.02) # forced floor so the recorded time is > 0 + return {"some_metric": 1.5} + + s = _Stub("test", {}, {}) + result = s.rollout(vecenv=None, policy=None, args={}) + assert "eval_seconds" in result.metrics + assert result.metrics["eval_seconds"] >= 0.02 + assert result.metrics["some_metric"] == 1.5 + + def test_eval_args_compose_train_section_and_clean_macro(): """_build_eval_args must fold train_config['env'] (baseline) + section overrides + clean macro correctly. Section beats baseline, From 38a9c05523b3c1aeeebca267b30f9cf8bcafe223 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 00:43:22 -0400 Subject: [PATCH 16/26] drive.py: fix missing goal_radius in resample-time binding.shared MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The eval-mode resample path at drive.py:447 was missing goal_radius in its kwargs to binding.shared, while the initial-spawn call at line 293 has it. binding.shared's C side (binding.c:1545) requires goal_radius via unpack(), so the resample crashes with "Missing required keyword argument 'goal_radius'" once a scenario batch completes and a new one is requested. Latent bug — never triggered because the legacy [eval] section had multi_scenario_eval = False as the default, so the eval rollout path that triggers resample never fired in production. The new EvalManager flips multi_scenario_eval-equivalent ([eval.validation_gigaflow]) on by default, surfacing the crash. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/ocean/drive/drive.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pufferlib/ocean/drive/drive.py b/pufferlib/ocean/drive/drive.py index 7dc537ca5..e417e3473 100644 --- a/pufferlib/ocean/drive/drive.py +++ b/pufferlib/ocean/drive/drive.py @@ -458,6 +458,7 @@ def step(self, actions): min_agents_per_env=self.min_agents_per_env, max_agents_per_env=self.max_agents_per_env, num_eval_scenarios=self.current_num_eval_scenarios, # Use the dynamic size here + goal_radius=self.goal_radius, ) # In eval mode, don't wrap counter - allows termination condition to work correctly From 6514e8d6a8081bf6f87c1519ee8394a1fa1ab55b Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 00:48:28 -0400 Subject: [PATCH 17/26] drive.ini: merge validation_gigaflow + validation_gigaflow_render Single section now does both: 250-scenario metric pass + 5-scenario render pass via the render_num_scenarios knob. 
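
After the manager's dotted-key flattening, the merged section resolves to
roughly this per-evaluator config (abridged; values taken from the diff
below):

    {
        "interval": 250,
        "render": True,
        "render_views": ["sim_state", "bev"],
        "eval": {"num_scenarios": 250, "render_num_scenarios": 5},
    }
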
The split made sense before render became a per-section flag; it's redundant now. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/config/ocean/drive.ini | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index b49cc1de8..320184608 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -219,7 +219,11 @@ enabled = true interval = 250 mode = "inline" clean = true -render = false +; Single rollout: 250-scenario metric pass + a 5-scenario render pass +; for the wandb panel. render_num_scenarios decouples the render budget +; from the metric pass so videos stay cheap. +render = true +render_views = ["sim_state", "bev"] env.simulation_mode = "gigaflow" env.map_dir = "pufferlib/resources/drive/binaries/carla_py123d" env.num_maps = 8 @@ -229,14 +233,7 @@ env.max_agents_per_env = 50 env.scenario_length = 3000 env.resample_frequency = 3000 eval.num_scenarios = 250 - -[eval.validation_gigaflow_render] -inherits = "validation_gigaflow" -enabled = true -interval = 250 -render = true -render_views = ["sim_state", "bev"] -eval.num_scenarios = 5 +eval.render_num_scenarios = 5 ; --------------------------------------------------------------------------- ; Driving-behaviour evaluation: nuPlan scenes labeled by scene type. Each From 2b074149abd765409e3035725ee371404d5e6e7f Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 00:54:04 -0400 Subject: [PATCH 18/26] [WIP] eval: fix render-path vec.make signature for PufferEnv backend PufferEnv backend in pufferlib.vector.make treats env_creator as a single callable and passes env_args/env_kwargs to it directly (line 697 in vector.py). Multiprocessing/Serial backends expect lists. Render must use PufferEnv (one ffmpeg pipe per env), so pass a single creator + dict, not lists. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../ocean/benchmark/evaluators/multi_scenario.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py index bd6c5672a..76f3f0ab0 100644 --- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py +++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py @@ -106,14 +106,16 @@ def _render_pass(self, vecenv, policy, args) -> list: view_idx = _VIEW_NAME_TO_IDX.get(view, 0) view_suffix = "" if view == "sim_state" else f"_{view}" + # PufferEnv backend treats the creator as a single callable and + # passes env_args/env_kwargs to it directly (not as per-env lists). + # The Multiprocessing/Serial backends expect lists; we don't use + # those here because EGL render assumes one ffmpeg pipe per env. 
vec = pufferlib.vector.make(
-            [make_env],
-            env_args=[[]],
-            env_kwargs=[render_env_kwargs],
+            make_env,
+            env_args=[],
+            env_kwargs=render_env_kwargs,
             backend="PufferEnv",
             num_envs=1,
-            num_workers=1,
-            batch_size=1,
         )
         target = vec if not hasattr(vec, "envs") else vec.envs[0]
         internal = getattr(target, "num_envs", 1)

From 9b7f9d64b8385d85baeca5e45c58577a64c3b647 Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sun, 10 May 2026 01:14:58 -0400
Subject: [PATCH 19/26] [WIP] eval: validation_gigaflow runs 1 scenario per map, not 250
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 8-map carla dir doesn't need 250 scenarios — gigaflow's C-side eval
already creates one internal env per scenario (capped at num_scenarios)
and steps them in a single batched rollout, so num_scenarios=8 covers
every map exactly once in parallel.

Drop num_agents 512→400 to fill exactly 8 × 50 slots with no wasted
capacity.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 pufferlib/config/ocean/drive.ini | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini
index 320184608..ba3a57231 100644
--- a/pufferlib/config/ocean/drive.ini
+++ b/pufferlib/config/ocean/drive.ini
@@ -219,20 +219,22 @@ enabled = true
 interval = 250
 mode = "inline"
 clean = true
-; Single rollout: 250-scenario metric pass + a 5-scenario render pass
-; for the wandb panel. render_num_scenarios decouples the render budget
-; from the metric pass so videos stay cheap.
+; One rollout per map (8 carla maps). C-side gigaflow eval cycles maps
+; sequentially within one PufferEnv worker: env_count = min(ceil(num_agents
+; / max_per_env), num_scenarios), so 8 internal envs (one per map) step
+; in parallel via the batched C kernel — no multiprocessing needed.
+; render_num_scenarios decouples the render budget so videos stay cheap.
 render = true
 render_views = ["sim_state", "bev"]
 env.simulation_mode = "gigaflow"
 env.map_dir = "pufferlib/resources/drive/binaries/carla_py123d"
 env.num_maps = 8
-env.num_agents = 512
+env.num_agents = 400
 env.min_agents_per_env = 50
 env.max_agents_per_env = 50
 env.scenario_length = 3000
 env.resample_frequency = 3000
-eval.num_scenarios = 250
+eval.num_scenarios = 8
 eval.render_num_scenarios = 5

From 5b7997eda830398fd2cb620246ec6634978a33e7 Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sun, 10 May 2026 01:22:12 -0400
Subject: [PATCH 20/26] [WIP] eval: cap render clip length, decouple from metric-pass scenario_length
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Render path was falling back to env.scenario_length (3000 = 100s mp4 at
30fps) because render_max_steps wasn't a real config knob. Two fixes:

1. multi_scenario.py: read eval.render_max_steps from args["eval"] where
   _build_eval_args puts evaluator-private fields, and default to 91
   (~3 sec) — not scenario_length, which is the metric pass.

2. drive.ini: set eval.render_max_steps = 91 on validation_gigaflow
   explicitly so the comment + value document the intent.

EGL render is ~3 fps wall-clock at 1080p, so 91 × 5 × 2 ≈ 5 min/render
pass instead of ~2.8 h when defaulting to scenario_length=3000.
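
The budget arithmetic, spelled out (a sketch; the ~3 fps figure is the
wall-clock estimate above, not a measured constant):

    steps, scenarios, views, fps = 91, 5, 2, 3.0
    seconds = steps * scenarios * views / fps  # 910 frames → ~303 s ≈ 5 min
    # with steps = 3000 (the scenario_length fallback): ~10000 s ≈ 2.8 h
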
Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/config/ocean/drive.ini | 3 +++ pufferlib/ocean/benchmark/evaluators/multi_scenario.py | 6 +++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index ba3a57231..c464c4c2a 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -236,6 +236,9 @@ env.scenario_length = 3000 env.resample_frequency = 3000 eval.num_scenarios = 8 eval.render_num_scenarios = 5 +; ~3 sec mp4 per scenario at 30 fps. Render is ~3 fps wall-clock at 1080p, +; so 91 steps × 5 scenarios × 2 views ≈ 5 min — well below the eval cadence. +eval.render_max_steps = 91 ; --------------------------------------------------------------------------- ; Driving-behaviour evaluation: nuPlan scenes labeled by scene type. Each diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py index 76f3f0ab0..f69c17ff3 100644 --- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py +++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py @@ -141,7 +141,11 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: eval_cfg = self.config.get("eval", {}) metric_count = int(eval_cfg.get("num_scenarios", 1)) num_scenarios = int(eval_cfg.get("render_num_scenarios", min(metric_count, 3))) - max_steps = args.get("render_max_steps") or int(args["env"].get("scenario_length", 91)) + # Render-clip length: independent of scenario_length (which is the + # metric-pass length). At 30 fps, 91 steps = ~3s mp4. Per-step EGL + # render is the bottleneck (~3 fps wall-clock at 1080p), so keeping + # this small directly bounds the render-pass runtime. + max_steps = int(args.get("eval", {}).get("render_max_steps", 91)) saved_cwd = os.getcwd() os.chdir(out_dir) From ec0698a8762dc03de16f6f05b178f3d94852a729 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 01:25:46 -0400 Subject: [PATCH 21/26] =?UTF-8?q?[WIP]=20eval:=20bump=20render=5Fmax=5Fste?= =?UTF-8?q?ps=20default=2091=20=E2=86=92=20300=20(10s=20clip=20vs=203s)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/config/ocean/drive.ini | 6 +++--- pufferlib/ocean/benchmark/evaluators/multi_scenario.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index c464c4c2a..de05defa0 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -236,9 +236,9 @@ env.scenario_length = 3000 env.resample_frequency = 3000 eval.num_scenarios = 8 eval.render_num_scenarios = 5 -; ~3 sec mp4 per scenario at 30 fps. Render is ~3 fps wall-clock at 1080p, -; so 91 steps × 5 scenarios × 2 views ≈ 5 min — well below the eval cadence. -eval.render_max_steps = 91 +; ~10 sec mp4 per scenario at 30 fps. Render is ~3 fps wall-clock at 1080p, +; so 300 steps × 5 scenarios × 2 views ≈ 17 min — below the eval cadence. +eval.render_max_steps = 300 ; --------------------------------------------------------------------------- ; Driving-behaviour evaluation: nuPlan scenes labeled by scene type. 
Each
diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
index f69c17ff3..8f1777320 100644
--- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
+++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
@@ -142,10 +142,10 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir:
         metric_count = int(eval_cfg.get("num_scenarios", 1))
         num_scenarios = int(eval_cfg.get("render_num_scenarios", min(metric_count, 3)))
         # Render-clip length: independent of scenario_length (which is the
-        # metric-pass length). At 30 fps, 91 steps = ~3s mp4. Per-step EGL
+        # metric-pass length). At 30 fps, 300 steps = ~10s mp4. Per-step EGL
         # render is the bottleneck (~3 fps wall-clock at 1080p), so keeping
         # this small directly bounds the render-pass runtime.
-        max_steps = int(args.get("eval", {}).get("render_max_steps", 91))
+        max_steps = int(args.get("eval", {}).get("render_max_steps", 300))

         saved_cwd = os.getcwd()
         os.chdir(out_dir)

From cf955d9c3a7f0c846a8573aa93d765350abf0376 Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sun, 10 May 2026 01:34:55 -0400
Subject: [PATCH 22/26] [WIP] eval: default to PufferEnv inline, keep Multiprocessing opt-in
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The old MultiScenarioEvaluator forced Multiprocessing + async_reset by
default, which (a) added fork/IPC startup cost without throughput gain
at our scale, (b) broke render (one ffmpeg pipe per env requires
single-process), and (c) couldn't construct the vec env at all because
the manager passed the PufferEnv-shaped single creator to the
list-shaped backend dispatch.

Three changes:

1. multi_scenario.py: drop the vec_overrides() and _initial_reset()
   overrides that pinned Multiprocessing — inherit base-class PufferEnv
   + sync reset.

2. manager.py:_run_inline: branch on backend so Multiprocessing remains
   a valid opt-in via [eval.<name>.vec] backend = "Multiprocessing".
   Useful for memory-bound replay sweeps, hetero-config evals, or async
   overlap on long rollouts — not needed for the 8-map carla validation
   eval that's the hot path today.

3. The 8-map validation eval now spins up one Drive with 8 internal
   envs in-process, no fork.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../benchmark/evaluators/multi_scenario.py | 19 ++++-------
 pufferlib/ocean/benchmark/manager.py       | 32 +++++++++++++------
 2 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
index 8f1777320..1c161c5db 100644
--- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
+++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
@@ -17,12 +17,6 @@ class MultiScenarioEvaluator(Evaluator):

     type_name: ClassVar[str] = "multi_scenario"

-    def vec_overrides(self) -> dict:
-        # Multi-worker by default for throughput. Override via [eval.<name>.vec].
-        backend = self.train_config.get("vec", {}).get("backend", "PufferEnv")
-        num_envs = int(self.config.get("vec", {}).get("num_envs", 1))
-        return {"backend": backend, "num_envs": num_envs}
-
     def env_overrides(self) -> dict:
         env = {
             "eval_mode": 1,
@@ -32,13 +26,14 @@ def env_overrides(self) -> dict:
         env.update(self.config.get("env", {}))
         return env

-    # -- Loop hooks --
+    # vec_overrides + _initial_reset use the base-class defaults: PufferEnv
+    # backend with num_envs=1 and a sync reset. 
Drive's C side already
+    # allocates `min(ceil(num_agents/max_per_env), num_eval_scenarios)`
+    # internal envs and steps them in one batched kernel call, so we get
+    # full per-map parallelism without paying multi-process fork/IPC cost.
+    # Override [eval.<name>.vec] in the ini if you genuinely need workers.

-    def _initial_reset(self, vecenv, args):
-        # Multi-worker async reset gives us the parallel-throughput path.
-        vecenv.async_reset(args.get("seed", 42))
-        ob, _, _, _, _, _, _ = vecenv.recv()
-        return ob
+    # -- Loop hooks --

     def _maybe_reset_lstm(self, state, steps, args):
         # Reset between scenarios — gigaflow's auto-resample fires at the
diff --git a/pufferlib/ocean/benchmark/manager.py b/pufferlib/ocean/benchmark/manager.py
index 9e79a50bc..2ec021c08 100644
--- a/pufferlib/ocean/benchmark/manager.py
+++ b/pufferlib/ocean/benchmark/manager.py
@@ -139,18 +139,30 @@ def _run_inline(self, ev: Evaluator, policy, env_name: str, global_step) -> Eval
         make_env = env_module.env_creator(env_name)

         vec_kwargs = ev.vec_overrides()
+        backend = vec_kwargs.get("backend", "PufferEnv")
         num_envs = int(vec_kwargs.get("num_envs", 1))
-        env_kwargs_list = [args["env"] for _ in range(num_envs)]
-        env_creators = [make_env] * num_envs
-        env_args_list = [[]] * num_envs
-        vec_call_kwargs = dict(vec_kwargs)
-        vec_call_kwargs.setdefault("num_workers", num_envs)
-        vec_call_kwargs.setdefault("batch_size", num_envs)
-
-        vecenv = pufferlib.vector.make(
-            env_creators, env_args=env_args_list, env_kwargs=env_kwargs_list, **vec_call_kwargs
-        )
+        # PufferEnv is the default: Drive's C kernel batches all internal
+        # envs in one call so we get per-map parallelism without paying
+        # fork/IPC cost, and render shares the single ffmpeg pipeline.
+        # Multiprocessing is opt-in via [eval.<name>.vec] backend = ...
+        # for evals that genuinely need it (memory-split for big replay
+        # sweeps, hetero scenarios, async overlap on long rollouts).
+        # The two backends have incompatible call shapes; branch here.
+        if backend == "PufferEnv":
+            vecenv = pufferlib.vector.make(
+                make_env, env_args=[], env_kwargs=args["env"], backend=backend, num_envs=num_envs
+            )
+        else:
+            vec_call_kwargs = dict(vec_kwargs)
+            vec_call_kwargs.setdefault("num_workers", num_envs)
+            vec_call_kwargs.setdefault("batch_size", num_envs)
+            vecenv = pufferlib.vector.make(
+                [make_env] * num_envs,
+                env_args=[[]] * num_envs,
+                env_kwargs=[args["env"] for _ in range(num_envs)],
+                **vec_call_kwargs,
+            )
         try:
             res = ev.rollout(vecenv, policy, args)
         finally:

From 725475f0c57291f01063b871c6f1991e57122d1b Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sun, 10 May 2026 01:46:23 -0400
Subject: [PATCH 23/26] [WIP] eval: render returns only this-pass mp4s; nuplan paths use real path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes for the multi-evaluator smoke test:

1. multi_scenario._render_view: glob filter excludes mp4s that already
   existed in out_dir before this pass. The dir is shared across epochs
   and across views, so a bare `out_dir.glob('*.mp4')` was returning
   every mp4 from prior render passes too — turning epoch 12510's 16-mp4
   render into a 32-mp4 wandb upload (the 16 from epoch 12505 were still
   on disk).

2. drive.ini: replace `/scratch/$USER/data/nuplan/...` with the literal
   path. configparser doesn't expand env vars, and the manager doesn't
   either, so behavior evals were crashing on FileNotFoundError trying
   to open `/scratch/$USER/data/...`.
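
If ini-level expansion is ever wanted instead, the hook is a stdlib
one-liner (sketch; where to call it in the manager's config load is the
open question):

    import os
    os.path.expandvars("/scratch/$USER/data/nuplan/categories_v021")
    # → "/scratch/ev2237/data/nuplan/categories_v021" for user ev2237
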
Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/config/ocean/drive.ini | 24 +++++++++---------- .../benchmark/evaluators/multi_scenario.py | 7 +++++- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index de05defa0..567ff6098 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -269,73 +269,73 @@ eval.render_num_scenarios = 2 inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/nuplan_mini_train_bins" +env.map_dir = "/scratch/ev2237/data/nuplan/nuplan_mini_train_bins" [eval.behaviors_hard_stop] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/hard_stop" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/hard_stop" [eval.behaviors_highway_straight] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/highway_straight" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/highway_straight" [eval.behaviors_lane_change] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/lane_change" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/lane_change" [eval.behaviors_merge] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/merge" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/merge" [eval.behaviors_parked_cars] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/parked_cars" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/parked_cars" [eval.behaviors_roundabout] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/roundabout" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/roundabout" [eval.behaviors_stopped_traffic] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/stopped_traffic" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/stopped_traffic" [eval.behaviors_traffic_light_green] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/traffic_light_green" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_green" [eval.behaviors_traffic_light_stop] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/traffic_light_stop" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_stop" [eval.behaviors_unprotected_left] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/unprotected_left" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_left" [eval.behaviors_unprotected_right] inherits = "behaviors_defaults" type = "behavior_class" enabled = true -env.map_dir = "/scratch/$USER/data/nuplan/categories_v021/unprotected_right" +env.map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_right" ; --------------------------------------------------------------------------- ; Optional: WOSAC realism eval. 
Off by default. diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py index 1c161c5db..054e73cd4 100644 --- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py +++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py @@ -144,6 +144,11 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: saved_cwd = os.getcwd() os.chdir(out_dir) + # Snapshot existing mp4s so we only return files written in this + # pass — out_dir is shared across epochs (and across views), so + # globbing the dir at the end would re-pick up every mp4 from prior + # render passes and make _log think we rendered far more than we did. + existing = set(out_dir.glob("*.mp4")) try: state = self._init_lstm_state(num_agents, policy, device, args) scenarios_processed = 0 @@ -173,7 +178,7 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: finally: os.chdir(saved_cwd) - return sorted(p for p in out_dir.glob("*.mp4")) + return sorted(p for p in out_dir.glob("*.mp4") if p not in existing) _VIEW_NAME_TO_IDX = { From 38eec92094a041c19caa002bb7a9bb212db9c9a4 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 10 May 2026 01:47:48 -0400 Subject: [PATCH 24/26] [WIP] eval: stamp global_step into render mp4 filenames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously `{scenario_id}{view_suffix}.mp4` — each render epoch overwrote the previous epoch's mp4 in place, making the prior fix's existing-file snapshot filter incorrect (filter would exclude the freshly-written file because Path equality matches the prior path). Now `{scenario_id}_step{N}{view_suffix}.mp4` so: - successive eval epochs produce distinct mp4s (no overwrites). - wandb's render carousel shows one entry per epoch, letting the user watch policy evolve over training. - the return-paths glob is exact: just files matching this step's pattern, no snapshot-filtering trickery. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../benchmark/evaluators/multi_scenario.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py index 054e73cd4..5a0861dd0 100644 --- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py +++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py @@ -96,10 +96,17 @@ def _render_pass(self, vecenv, policy, args) -> list: if num_maps > 1: render_env_kwargs["starting_map"] = random.randint(0, num_maps - 1) + # Stamp the training step into the filename so successive epochs + # produce distinct mp4s (Town01.xodr_step25100000_bev.mp4) instead + # of overwriting in place. wandb then shows one entry per epoch + # in the render carousel — useful for watching policy evolve over + # training. global_step falls back to 0 for ad-hoc CLI runs. + step_suffix = f"_step{int(args.get('global_step') or 0)}" + all_paths = [] for view in self.render_views: view_idx = _VIEW_NAME_TO_IDX.get(view, 0) - view_suffix = "" if view == "sim_state" else f"_{view}" + view_suffix = step_suffix + ("" if view == "sim_state" else f"_{view}") # PufferEnv backend treats the creator as a single callable and # passes env_args/env_kwargs to it directly (not as per-env lists). 
@@ -151,11 +158,13 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir:

         saved_cwd = os.getcwd()
         os.chdir(out_dir)
-        # Snapshot existing mp4s so we only return files written in this
-        # pass — out_dir is shared across epochs (and across views), so
-        # globbing the dir at the end would re-pick up every mp4 from prior
-        # render passes and make _log think we rendered far more than we did.
-        existing = set(out_dir.glob("*.mp4"))
+        # Filename pattern for this pass: each scenario writes
+        # `{scenario_id}_step{N}{view_suffix}.mp4`. Globbing by step suffix
+        # picks up only this-pass mp4s and ignores accumulated files from
+        # prior epochs that share the dir. The suffix is recomputed from
+        # args["global_step"] exactly as _render_pass builds it (and as
+        # set_video_suffix stamped each env), so the two stay aligned.
+        step_glob = f"*_step{int(args.get('global_step') or 0)}*.mp4"
         try:
             state = self._init_lstm_state(num_agents, policy, device, args)
             scenarios_processed = 0
@@ -178,7 +187,7 @@ def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir:
         finally:
             os.chdir(saved_cwd)

-        return sorted(p for p in out_dir.glob("*.mp4") if p not in existing)
+        return sorted(out_dir.glob(step_glob))


 _VIEW_NAME_TO_IDX = {

From 0e83577bcd99281fc339ad58e5ae18a92e4ae4c7 Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sun, 10 May 2026 02:01:01 -0400
Subject: [PATCH 25/26] [WIP] eval: lift render to base, behaviors now render
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Behavior-class evaluators previously had render=true in config but no
_render_pass implementation, so the videos silently never appeared.
Three changes:

1. base.py: lift _render_pass + _render_view from MultiScenarioEvaluator
   into Evaluator. The render loop is generic — fresh PufferEnv with
   render_mode=headless, ffmpeg pipe per active env per view, mp4s
   stamped with global_step in the filename.

2. base.py: add a _render_env_overrides hook so subclasses can tweak the
   render env (default = metric env + render_mode=headless). Render loop
   now caps internal-env render count at eval.render_num_scenarios
   instead of always rendering the full batch (the C kernel still steps
   the full batch — just fewer ffmpeg pipes). Drops the dead
   batch_size_eval write.

3. multi_scenario.py: keeps the random-starting_map override (its only
   real difference from the default render path) — everything else is
   now inherited.

Result: behavior_class and human_replay inherit a working render path.
For 12 behavior classes with render_num_scenarios=2, render cost is
bounded at 300 steps × 2 scenarios × 2 views at ~3 fps ≈ 7 min/class.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 pufferlib/ocean/benchmark/evaluators/base.py   | 131 ++++++++++++-
 .../benchmark/evaluators/multi_scenario.py     | 173 ++----------------
 2 files changed, 141 insertions(+), 163 deletions(-)

diff --git a/pufferlib/ocean/benchmark/evaluators/base.py b/pufferlib/ocean/benchmark/evaluators/base.py
index 08406b5cd..999102ca6 100644
--- a/pufferlib/ocean/benchmark/evaluators/base.py
+++ b/pufferlib/ocean/benchmark/evaluators/base.py
@@ -159,6 +159,133 @@ def _aggregate_infos(self, infos: list) -> dict:
                 out[k] = float(np.mean(vals))
         return out

+    # -- Render (default EGL → ffmpeg mp4 pipeline) ----------------------
+
     def _render_pass(self, vecenv, policy, args) -> list:
-        """Render hook. 
Subclasses that support frame capture override this.""" - return [] + """Build a fresh PufferEnv with `render_mode=headless`, render one + clip per (scenario, view), return mp4 paths. Returns [] for non-egl + backends. Subclasses customize the render env via `_render_env_overrides`. + """ + backend = args.get("render_backend", "egl") + if backend != "egl": + return [] + + import importlib + from pathlib import Path + + import pufferlib + + out_dir = Path(args.get("render_results_dir") or args.get("eval_results_dir") or ".") / "mp4" + out_dir.mkdir(parents=True, exist_ok=True) + + package = args.get("package", "ocean") + module_name = "pufferlib.ocean" if package == "ocean" else f"pufferlib.environments.{package}" + env_module = importlib.import_module(module_name) + make_env = env_module.env_creator(args["env_name"]) + + render_env_kwargs = self._render_env_overrides(args) + # Stamp the training step into the filename so successive epochs + # produce distinct mp4s and wandb's render carousel shows policy + # evolution. global_step falls back to 0 for ad-hoc CLI runs. + step_suffix = f"_step{int(args.get('global_step') or 0)}" + + all_paths = [] + for view in self.render_views: + view_idx = _VIEW_NAME_TO_IDX.get(view, 0) + view_suffix = step_suffix + ("" if view == "sim_state" else f"_{view}") + + vec = pufferlib.vector.make( + make_env, + env_args=[], + env_kwargs=render_env_kwargs, + backend="PufferEnv", + num_envs=1, + ) + target = vec if not hasattr(vec, "envs") else vec.envs[0] + internal = getattr(target, "num_envs", 1) + for e in range(internal): + target.set_video_suffix(view_suffix, env_idx=e) + + paths = self._render_view(vec, target, policy, args, view_idx, out_dir, step_suffix) + vec.close() + all_paths.extend(paths) + return all_paths + + def _render_env_overrides(self, args) -> dict: + """Build env kwargs for the render env. Default: same as the + metric-pass env plus `render_mode=headless`. Subclasses override + to inject things like a random starting_map (gigaflow validation) + or a shrunken bin set (behavior class).""" + out = dict(args["env"]) + out["render_mode"] = "headless" + return out + + def _render_view(self, vecenv, target_env, policy, args, view_idx, out_dir, step_suffix) -> list: + """One rollout per render-env, writes one mp4 per active env per view. + Caps how many internal envs actually feed ffmpeg pipes via + `eval.render_num_scenarios` so render cost stays bounded.""" + import os + + import numpy as np + import torch + + import pufferlib + + device = args["train"]["device"] + num_agents = vecenv.observation_space.shape[0] + + eval_cfg = self.config.get("eval", {}) + metric_count = int(eval_cfg.get("num_scenarios", 1)) + num_scenarios = int(eval_cfg.get("render_num_scenarios", min(metric_count, 3))) + # Render-clip length: independent of scenario_length (which is the + # metric-pass length). At 30 fps, 300 steps = ~10s mp4. Per-step EGL + # render is the bottleneck (~3 fps wall-clock at 1080p), so keeping + # this small directly bounds the render-pass runtime. + max_steps = int(eval_cfg.get("render_max_steps", 300)) + + saved_cwd = os.getcwd() + os.chdir(out_dir) + # Glob for files written this pass: every mp4 has the step suffix, + # so a step_suffix-prefixed glob filters out accumulated mp4s from + # prior epochs (the dir is shared across runs). 
+        step_glob = f"*{step_suffix}*.mp4"
+        try:
+            state = self._init_lstm_state(num_agents, policy, device, args)
+            scenarios_processed = 0
+            while scenarios_processed < num_scenarios:
+                ob, _ = vecenv.reset()
+                scenarios = vecenv.get_state()
+                num_in_batch = len(scenarios)
+                # Cap how many envs render this iteration: the C kernel
+                # steps the full batch regardless, but only the first
+                # `to_render` envs feed ffmpeg pipes.
+                to_render = min(num_in_batch, num_scenarios - scenarios_processed)
+                if state:
+                    state["lstm_h"].zero_()
+                    state["lstm_c"].zero_()
+                for _ in range(max_steps):
+                    with torch.no_grad():
+                        ob_t = torch.as_tensor(ob).to(device)
+                        logits, _ = policy.forward_eval(ob_t, state)
+                        action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True)
+                    action = action.cpu().numpy().reshape(vecenv.action_space.shape)
+                    if isinstance(logits, torch.distributions.Normal):
+                        action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high)
+                    ob, _, _, _, _ = vecenv.step(action)
+                    for e in range(to_render):
+                        target_env.render(env_idx=e, view_mode=view_idx)
+                for e in range(to_render):
+                    target_env.close_client(env_idx=e)
+                scenarios_processed += to_render
+        finally:
+            os.chdir(saved_cwd)
+
+        return sorted(out_dir.glob(step_glob))
+
+
+_VIEW_NAME_TO_IDX = {
+    "sim_state": 0,
+    "bev": 1,
+    "topdown_sim": 2,
+    "bev_all": 3,
+}
diff --git a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
index 5a0861dd0..27930dea3 100644
--- a/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
+++ b/pufferlib/ocean/benchmark/evaluators/multi_scenario.py
@@ -1,14 +1,7 @@
-"""MultiScenarioEvaluator — distribute scenarios across workers, one rollout
-per scenario, mean per-scenario metrics. Drives both the gigaflow validation
-path and replay-style multi-scenario evals.
-
-Inherits the default loop from `Evaluator`; overrides `_should_stop` (cap by
-scenario count), `_initial_reset` (async reset for multi-worker throughput),
-`_maybe_reset_lstm` (per-scenario LSTM reset), and `_render_pass` (the C-side
-EGL → ffmpeg mp4 dump)."""
-
-import os
-from pathlib import Path
+"""MultiScenarioEvaluator — gigaflow validation eval. C-side eval_mode
+cycles maps sequentially in one batched rollout, so the base loop +
+PufferEnv defaults handle parallelism without multi-process workers."""
+
 from typing import ClassVar
 
 from pufferlib.ocean.benchmark.evaluators.base import Evaluator
@@ -26,15 +19,6 @@ def env_overrides(self) -> dict:
         env.update(self.config.get("env", {}))
         return env
 
-    # vec_overrides + _initial_reset use the base-class defaults: PufferEnv
-    # backend with num_envs=1 and a sync reset. Drive's C side already
-    # allocates `min(ceil(num_agents/max_per_env), num_eval_scenarios)`
-    # internal envs and steps them in one batched kernel call, so we get
-    # full per-map parallelism without paying multi-process fork/IPC cost.
-    # Override [eval.<name>.vec] in the ini if you genuinely need workers.
-
-    # -- Loop hooks --
-
     def _maybe_reset_lstm(self, state, steps, args):
         # Reset between scenarios — gigaflow's auto-resample fires at the
         # end of scenario_length, so steps % scenario_length == 0 is the
@@ -50,149 +34,16 @@ def _should_stop(self, args, infos_collected, steps) -> bool:
         target = int(self.config.get("eval", {}).get("num_scenarios", 1))
         return len(infos_collected) >= target
 
-    # -- Render --
-
-    def _render_pass(self, vecenv, policy, args) -> list:
-        """One rollout per view, all writing mp4s to a single dir.
-
-        Builds a fresh single-worker env per view (C-side ffmpeg-per-env
-        wiring assumes one bin at a time per process). Render budget and
-        starting position are independent of the metric pass:
-
-        - eval.render_num_scenarios — how many scenarios to render. Defaults
-          to min(eval.num_scenarios, 3). Always takes precedence over
-          num_scenarios so renders stay cheap.
-        - starting_map — randomized per render epoch so successive epochs
-          show different scenarios from the dir, not the same first-N
-          alphabetically. Set explicitly in env.* to pin.
-        """
-        import importlib
+    def _render_env_overrides(self, args) -> dict:
+        # Random starting_map per render epoch — every epoch shows a
+        # different bin from the dir rather than the same alphabetical
+        # first-N. Pin by setting env.starting_map explicitly in the
+        # [eval.<name>] section.
         import random
-        import pufferlib
-
-        backend = args.get("render_backend", "egl")
-        if backend != "egl":
-            return []
-
-        env_name = args["env_name"]
-        out_dir = Path(args.get("render_results_dir") or args.get("eval_results_dir") or ".") / "mp4"
-        out_dir.mkdir(parents=True, exist_ok=True)
-
-        package = args.get("package", "ocean")
-        module_name = "pufferlib.ocean" if package == "ocean" else f"pufferlib.environments.{package}"
-        env_module = importlib.import_module(module_name)
-        make_env = env_module.env_creator(env_name)
-
-        render_env_kwargs = dict(args["env"])
-        render_env_kwargs["render_mode"] = "headless"
-
-        # Random starting map per render epoch — every epoch shows a
-        # different bin from the directory rather than the first N
-        # alphabetically. The user can pin by setting env.starting_map
-        # explicitly in the [eval.<name>] section.
+        out = super()._render_env_overrides(args)
         if "starting_map" not in self.config.get("env", {}):
-            num_maps = int(render_env_kwargs.get("num_maps", 1))
+            num_maps = int(out.get("num_maps", 1))
             if num_maps > 1:
-                render_env_kwargs["starting_map"] = random.randint(0, num_maps - 1)
-
-        # Stamp the training step into the filename so successive epochs
-        # produce distinct mp4s (Town01.xodr_step25100000_bev.mp4) instead
-        # of overwriting in place. wandb then shows one entry per epoch
-        # in the render carousel — useful for watching policy evolve over
-        # training. global_step falls back to 0 for ad-hoc CLI runs.
-        step_suffix = f"_step{int(args.get('global_step') or 0)}"
-
-        all_paths = []
-        for view in self.render_views:
-            view_idx = _VIEW_NAME_TO_IDX.get(view, 0)
-            view_suffix = step_suffix + ("" if view == "sim_state" else f"_{view}")
-
-            # PufferEnv backend treats the creator as a single callable and
-            # passes env_args/env_kwargs to it directly (not as per-env lists).
-            # The Multiprocessing/Serial backends expect lists; we don't use
-            # those here because EGL render assumes one ffmpeg pipe per env.
-            vec = pufferlib.vector.make(
-                make_env,
-                env_args=[],
-                env_kwargs=render_env_kwargs,
-                backend="PufferEnv",
-                num_envs=1,
-            )
-            target = vec if not hasattr(vec, "envs") else vec.envs[0]
-            internal = getattr(target, "num_envs", 1)
-            for e in range(internal):
-                target.set_video_suffix(view_suffix, env_idx=e)
-
-            paths = self._render_view(vec, target, policy, args, view_idx, out_dir)
-            vec.close()
-            all_paths.extend(paths)
-        return all_paths
-
-    def _render_view(self, vecenv, target_env, policy, args, view_idx: int, out_dir: Path) -> list:
-        import numpy as np
-        import torch
-
-        import pufferlib
-
-        device = args["train"]["device"]
-        num_agents = vecenv.observation_space.shape[0]
-        # Render budget defaults to min(num_scenarios, 3) if not set explicitly.
-        # Renders are expensive (mp4 encode + wandb upload) so we don't want
-        # them at metric-pass scale.
-        eval_cfg = self.config.get("eval", {})
-        metric_count = int(eval_cfg.get("num_scenarios", 1))
-        num_scenarios = int(eval_cfg.get("render_num_scenarios", min(metric_count, 3)))
-        # Render-clip length: independent of scenario_length (which is the
-        # metric-pass length). At 30 fps, 300 steps = ~10s mp4. Per-step EGL
-        # render is the bottleneck (~3 fps wall-clock at 1080p), so keeping
-        # this small directly bounds the render-pass runtime.
-        max_steps = int(args.get("eval", {}).get("render_max_steps", 300))
-
-        saved_cwd = os.getcwd()
-        os.chdir(out_dir)
-        # Filename pattern for this pass: each scenario writes
-        # `{scenario_id}{view_suffix}.mp4`, and view_suffix embeds the
-        # global_step stamp. Globbing by that stamp picks up only this-pass
-        # mp4s and ignores accumulated files from prior epochs that share
-        # the dir. _render_pass set_video_suffix'd each env before calling
-        # us; recomputing the same step stamp here keeps the two aligned.
-        step_glob = f"*_step{int(args.get('global_step') or 0)}*.mp4"
-        try:
-            state = self._init_lstm_state(num_agents, policy, device, args)
-            scenarios_processed = 0
-            while scenarios_processed < num_scenarios:
-                ob, _ = vecenv.reset()
-                scenarios = vecenv.get_state()
-                num_in_batch = len(scenarios)
-                remaining = num_scenarios - scenarios_processed - num_in_batch
-                target_env.batch_size_eval = max(1, remaining)
-                if state:
-                    state["lstm_h"].zero_()
-                    state["lstm_c"].zero_()
-                for _ in range(max_steps):
-                    with torch.no_grad():
-                        ob_t = torch.as_tensor(ob).to(device)
-                        logits, _ = policy.forward_eval(ob_t, state)
-                        action, _, _ = pufferlib.pytorch.sample_logits(logits, deterministic=True)
-                    action = action.cpu().numpy().reshape(vecenv.action_space.shape)
-                    if isinstance(logits, torch.distributions.Normal):
-                        action = np.clip(action, vecenv.action_space.low, vecenv.action_space.high)
-                    ob, _, _, _, _ = vecenv.step(action)
-                    for e in range(num_in_batch):
-                        target_env.render(env_idx=e, view_mode=view_idx)
-                for e in range(num_in_batch):
-                    target_env.close_client(env_idx=e)
-                scenarios_processed += num_in_batch
-        finally:
-            os.chdir(saved_cwd)
-
-        return sorted(out_dir.glob(step_glob))
-
-
-_VIEW_NAME_TO_IDX = {
-    "sim_state": 0,
-    "bev": 1,
-    "topdown_sim": 2,
-    "bev_all": 3,
-}
+                out["starting_map"] = random.randint(0, num_maps - 1)
+        return out

From 7253022eeba69a872ed67c97bfb539a1d4d91fe3 Mon Sep 17 00:00:00 2001
From: Eugene Vinitsky
Date: Sun, 10 May 2026 02:12:40 -0400
Subject: [PATCH 26/26] [WIP] eval: render output into per-evaluator subdir

Every evaluator runs at the same global_step, so the previous shared
out_dir + step glob made each evaluator's _render_view re-collect every
earlier evaluator's mp4s into result.frames. wandb then logged
validation_gigaflow's videos under behaviors_*/render too.

Per-evaluator subdir (`mp4/<name>/`) keeps each evaluator's render
output isolated.
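As a sketch (evaluator and file names illustrative), two evaluators
rendering at the same step now write to disjoint dirs:

    mp4/validation_gigaflow/Town01.xodr_step25100000_bev.mp4
    mp4/behaviors_foo/scenario_step25100000.mp4

and each _render_view's glob, roughly

    out_dir = results_dir / "mp4" / self.name
    sorted(out_dir.glob(f"*{step_suffix}*.mp4"))

only ever sees its own evaluator's files.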
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 pufferlib/ocean/benchmark/evaluators/base.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pufferlib/ocean/benchmark/evaluators/base.py b/pufferlib/ocean/benchmark/evaluators/base.py
index 999102ca6..edf839d7e 100644
--- a/pufferlib/ocean/benchmark/evaluators/base.py
+++ b/pufferlib/ocean/benchmark/evaluators/base.py
@@ -175,7 +175,11 @@ def _render_pass(self, vecenv, policy, args) -> list:
 
         import pufferlib
 
-        out_dir = Path(args.get("render_results_dir") or args.get("eval_results_dir") or ".") / "mp4"
+        # Per-evaluator subdir so each evaluator's mp4s don't get re-globbed
+        # by the next evaluator's _render_view (every evaluator runs at the
+        # same global_step, so a shared dir + step glob would collect every
+        # earlier evaluator's mp4s into this one's result.frames).
+        out_dir = Path(args.get("render_results_dir") or args.get("eval_results_dir") or ".") / "mp4" / self.name
         out_dir.mkdir(parents=True, exist_ok=True)
 
         package = args.get("package", "ocean")
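A minimal sketch of the resulting subclass contract (class name and knob
value are illustrative, not part of the patches): a new evaluator that
wants videos only shapes its render env; the render loop, the step glob,
and the mp4/<name>/ isolation are all inherited from Evaluator.

    from pufferlib.ocean.benchmark.evaluators.base import Evaluator

    class CheapRenderEvaluator(Evaluator):
        def _render_env_overrides(self, args) -> dict:
            # Base default: metric-pass env kwargs + render_mode=headless.
            out = super()._render_env_overrides(args)
            # e.g. pin to a single bin so render clips stay cheap.
            out["num_maps"] = 1
            return out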