From 89cef8de8a0fd2cf6707bca43cc4fe564901f358 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 18 Apr 2026 21:38:16 -0400 Subject: [PATCH 01/11] Clean inline eval: zero perturbations/dropout, enforce red lights MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - build_eval_overrides: always force perturbations to 0 (safe: no obs shape change). Add a clean= kwarg that additionally zeros road-segment dropout and flips traffic_light_behavior to 1 (stop at red). - _swap_policy_obs_counts: context manager that temporarily aligns the live training policy's obs_{lane,boundary}_segment_count with the eval env. The GigaFlow encoder's lane/boundary encoders are shared MLPs + max-pool over segments — weights are count-invariant, only slicing depends on these counts. So the same training policy runs correctly on a clean env with zero dropout (larger obs buffer) once we swap. - eval_multi_scenarios accepts clean= and wraps the forward loop with the swap when clean is True. - Inline eval call site (multi_scenario_eval) reads eval.clean_eval from the config and plumbs clean= all the way through. - drive.ini: add eval.clean_eval=True default. 
Fixes "no validation metrics": the inline render path (which HAD been running on multi_scenario_render_interval) only populates global_infos when an episode completes, but render_max_steps < scenario_length, so episodes rarely complete and global_infos stays empty. --- pufferlib/config/ocean/drive.ini | 6 ++ pufferlib/pufferl.py | 106 +++++++++++++++++++++++++++---- 2 files changed, 100 insertions(+), 12 deletions(-) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index e71db0260..c2bdd0e3e 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -206,6 +206,12 @@ multi_scenario_render_interval = 250 multi_scenario_render_backend = egl ; Frequency of evaluation during training (in epochs) eval_interval = 25 +; When True, inline eval zeroes road-segment dropout + perturbations and +; enforces red-light stops. Metrics then reflect performance under clean +; conditions. The live training policy is re-aligned to the clean env's +; obs shape via _swap_policy_obs_counts — safe because the GigaFlow +; encoder is count-invariant (shared MLP + max-pool over segments). +clean_eval = True num_agents = 512 ; Batch size for eval_multi_scenarios (number of scenarios per batch) ; Path to dataset used for evaluation diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 33d1f438e..d6b06e13d 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -501,13 +501,20 @@ def train(self): num_agents_eval = self.config["eval"]["num_agents"] map_dir = self.config["eval"]["map_dir"] - # Build eval_overrides using helper function + # Inline eval runs "clean" by default — perturbations + dropout off, + # red-light stops enforced — so the logged validation metrics + # track progress under controlled conditions rather than noisy + # training perturbations. The live training policy's road slicing + # is re-aligned to the clean env at eval time via + # _swap_policy_obs_counts inside eval_multi_scenarios. 
+ clean_eval = self.config["eval"].get("clean_eval", True) eval_overrides = build_eval_overrides( simulation_mode=eval_simulation_mode, num_agents=num_agents_eval, num_scenarios=self.config["eval"]["multi_scenario_num_scenarios"], map_dir=map_dir, num_carla_maps=self.config["eval"].get("num_carla_maps", 8), + clean=clean_eval, ) # Build eval args by applying overrides to training config @@ -546,6 +553,7 @@ def train(self): logger=self.logger, # Pass logger for TensorBoard logging metric_prefix="validation", # Use validation_ prefix quiet=True, # Suppress verbose output during inline eval + clean=clean_eval, ) # Multi-scenario render — independent interval so the heavier render @@ -1864,20 +1872,30 @@ def load_eval_multi_scenarios_config(env_name, model_path=None, eval_overrides=N return args -def build_eval_overrides(simulation_mode, num_agents, num_scenarios, map_dir=None, num_carla_maps=8): +def build_eval_overrides(simulation_mode, num_agents, num_scenarios, map_dir=None, num_carla_maps=8, clean=False): """Build evaluation overrides for a given simulation mode. Args: simulation_mode: "gigaflow" or "replay" num_agents: agent slot budget for evaluation map_dir: replay dataset directory, required for replay mode + clean: if True, run a "clean" eval — zero road-segment dropout and + enforce red-light stops. Only safe when the policy is rebuilt + from the eval env (standalone eval / render_scenario.py). Inline + eval during training reuses the live training policy, whose + encoder was built for the training obs shape; zeroing dropout + there changes the obs shape and triggers a CUDA device-side + assert. Perturbation probabilities (partner_blindness, + phantom_braking) are always forced to zero at eval — they're + pure randomness, they don't change the obs shape, and eval + should be deterministic regardless of clean mode. 
""" # Common reward coefficients (same for both modes) common_env = { "eval_mode": 1, "collision_behavior": 1, "offroad_behavior": 1, - "traffic_light_behavior": 0, + "traffic_light_behavior": 1 if clean else 0, "reward_randomization": False, "reward_vehicle_collision": 3.0, "reward_offroad_collision": 3.0, @@ -1889,15 +1907,22 @@ def build_eval_overrides(simulation_mode, num_agents, num_scenarios, map_dir=Non "reward_lane_align": 0.025, "reward_lane_center": 0.0038, "reward_timestep": 0.000025, - # NOTE: do not override lane_segment_dropout, boundary_segment_dropout, - # or max_{lane,boundary}_segment_observations here. All of these change - # the observation vector shape, and the render path reuses the live - # training policy which was built for the training obs sizes. Setting - # dropout to 0.0 here when training uses >0 causes the eval env to - # produce larger observations than the policy expects, triggering a - # CUDA device-side assert (scatter/gather index out of bounds). + # Always zero perturbations at eval. These don't change obs shape so + # it's safe to force even for inline eval, and a deterministic eval + # is what we want for tracking progress. + "partner_blindness_prob": 0.0, + "phantom_braking_prob": 0.0, + "phantom_braking_trigger_prob": 0.0, } + if clean: + # Dropout changes the obs shape. Only safe when the policy is + # rebuilt from the eval env (standalone eval / render_scenario). + # NEVER pass clean=True from an inline-eval call site — the live + # training policy's encoder was built for the training obs shape. + common_env["lane_segment_dropout"] = 0.0 + common_env["boundary_segment_dropout"] = 0.0 + if simulation_mode == "gigaflow": eval_overrides = { "env": { @@ -1938,6 +1963,46 @@ def build_eval_overrides(simulation_mode, num_agents, num_scenarios, map_dir=Non return eval_overrides +@contextlib.contextmanager +def _swap_policy_obs_counts(policy, vecenv): + """Temporarily align the policy's road-segment slicing with the eval env. 
+ + Training uses dropout > 0 → smaller obs_{lane,boundary}_segment_count. + Clean eval uses dropout = 0 → larger counts, larger obs buffer. The + GigaFlow encoder (lane_encoder / boundary_encoder) is a shared MLP + applied per-segment with max-pool — its weights are count-invariant. + Only the obs-buffer slicing in DriveBackbone.forward depends on these + counts, so we can just swap them for the duration of the eval and the + same training policy works on the larger clean obs. + """ + try: + eval_env = vecenv.driver_env + new_lane = int(eval_env.obs_lane_segment_count) + new_boundary = int(eval_env.obs_boundary_segment_count) + except AttributeError: + # If the eval env doesn't expose these (unknown wrapper), skip the + # swap — forward will still work when training and eval obs shapes + # coincide (clean=False or no dropout configured). + yield + return + + targets = [] + for m in policy.modules(): + if hasattr(m, "obs_lane_segment_count") and hasattr(m, "obs_boundary_segment_count"): + targets.append(m) + + saved = [(m.obs_lane_segment_count, m.obs_boundary_segment_count) for m in targets] + try: + for m in targets: + m.obs_lane_segment_count = new_lane + m.obs_boundary_segment_count = new_boundary + yield + finally: + for m, (orig_lane, orig_boundary) in zip(targets, saved): + m.obs_lane_segment_count = orig_lane + m.obs_boundary_segment_count = orig_boundary + + def verify_scenario_coverage(csv_path: str, num_scenarios: int) -> dict: """ Verify that episode_metrics.csv contains all expected scenarios. 
@@ -2087,7 +2152,14 @@ def _log_eval_metrics(logger, avg_infos, args, metric_prefix, quiet): def eval_multi_scenarios( - env_name, args=None, vecenv=None, policy=None, logger=None, metric_prefix="validation", quiet=False + env_name, + args=None, + vecenv=None, + policy=None, + logger=None, + metric_prefix="validation", + quiet=False, + clean=False, ): t0 = time.time() @@ -2097,14 +2169,20 @@ def eval_multi_scenarios( num_agents_eval = tmp_args["eval"]["num_agents"] map_dir = tmp_args["eval"]["map_dir"] + # CLI standalone entry point: read clean_eval from the eval section + # so users can enable it via --eval.clean-eval. Inline callers pass + # clean= directly and come in through the args-provided branch. + clean_from_config = tmp_args["eval"].get("clean_eval", False) eval_overrides = build_eval_overrides( simulation_mode=tmp_args["eval_simulation"], num_agents=num_agents_eval, num_scenarios=tmp_args["num_scenarios"], map_dir=map_dir, num_carla_maps=tmp_args.get("num_carla_maps", 8), + clean=clean_from_config, ) args = load_eval_multi_scenarios_config(env_name, model_path, eval_overrides) + clean = clean or clean_from_config # Reproducibility — same approach as training seed = args["train"]["seed"] or 42 @@ -2183,7 +2261,11 @@ def eval_multi_scenarios( vecenv.async_reset(42) ob, _, _, _, infos, _, _ = vecenv.recv() - with tqdm(total=num_scenarios, desc="Processing scenarios", disable=quiet) as pbar: + # Clean eval may use different road-dropout than training. The shared + # training policy's obs slicing needs to be aligned with this env; see + # _swap_policy_obs_counts. 
+ swap_ctx = _swap_policy_obs_counts(policy, vecenv) if clean else contextlib.nullcontext() + with swap_ctx, tqdm(total=num_scenarios, desc="Processing scenarios", disable=quiet) as pbar: while scenarios_processed < num_scenarios: # Reset LSTM if args["train"]["use_rnn"]: From d50b26cd0c8af5697e75a9965c665574dcc7f3c2 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sat, 18 Apr 2026 21:53:41 -0400 Subject: [PATCH 02/11] Remove stale _sys_instr debug writes from eval_multi_scenarios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The _sys_instr.stderr.write instrumentation was added as debug tracing for the render path's close_client sequence. It was also dropped into eval_multi_scenarios by mistake — but _sys_instr is only imported inside eval_multi_scenarios_render, so the non-render eval path crashes with NameError whenever it runs to completion. Just remove the instrumentation from the non-render eval function. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/pufferl.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index d6b06e13d..6581c6b0c 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -2312,12 +2312,8 @@ def eval_multi_scenarios( print(f"\nTotal evaluation time: {time.time() - t0:.2f} seconds for {num_scenarios} scenarios.") _log_eval_metrics(logger, avg_infos, args, metric_prefix, quiet) - _sys_instr.stderr.write("[render-instr] about to call vecenv.close()\n") - _sys_instr.stderr.flush() # Close vectorized environment to avoid file descriptor leaks vecenv.close() - _sys_instr.stderr.write("[render-instr] vecenv.close() returned\n") - _sys_instr.stderr.flush() def eval_multi_scenarios_render( From 3b8487dc48563c15ad397126438c6c9186c64322 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 19 Apr 2026 19:37:57 -0400 Subject: [PATCH 03/11] Add replay eval block (sibling of multi_scenario_eval) MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Runs inline eval on replay scenarios (nuPlan mini train bins) alongside the gigaflow multi_scenario_eval. Policy controls only the SDC (control_sdc_only) while other agents follow logged trajectories. Metrics log under metric_prefix "validation_replay" so they're distinguishable from the gigaflow "validation" eval. - drive.ini: add [eval].replay_eval + replay_map_dir + replay_num_scenarios + replay_scenario_length (201 for nuPlan duration_s=20 bins) + replay_control_mode + replay_init_steps. - pufferl.py: sibling block in _train that overrides the default WOMD replay scenario_length (91) on top of build_eval_overrides and calls eval_multi_scenarios with metric_prefix="validation_replay". Shares the clean= plumbing and _swap_policy_obs_counts — the replay env with clean=True has a different obs shape from training, and the swap keeps the live training policy usable. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/config/ocean/drive.ini | 15 +++++++++ pufferlib/pufferl.py | 56 ++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini index c2bdd0e3e..159d1455f 100644 --- a/pufferlib/config/ocean/drive.ini +++ b/pufferlib/config/ocean/drive.ini @@ -227,6 +227,21 @@ multi_scenario_num_scenarios = 250 ; Cap the render rollout at this many steps. render_max_steps = 200 backend = PufferEnv +; --- Replay eval (sibling of multi_scenario_eval) --- +; When True, fires a second inline eval on replay scenarios (nuPlan, WOMD) +; at the same cadence as eval_interval. Runs alongside the gigaflow +; multi_scenario_eval, logs under metric_prefix "validation_replay". +; Honors clean_eval same as gigaflow eval. +replay_eval = False +replay_map_dir = "/scratch/ev2237/data/nuplan/nuplan_mini_train_bins" +replay_num_scenarios = 16 +; Full trajectory length in the replay bins. 
nuPlan mini with duration_s=20 +; yields 201 steps (20.1s @ 10Hz); WOMD bins are 91. +replay_scenario_length = 201 +replay_control_mode = "control_sdc_only" +; Timestep to start replay from. Leaves a short pre-roll so agents have +; logged state; policy takes over at this step. +replay_init_steps = 10 ; WOSAC (Waymo Open Sim Agents Challenge) evaluation settings ; If True, enables evaluation on realism metrics each time we save a checkpoint wosac_realism_eval = False diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 6581c6b0c..9b5e81f06 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -556,6 +556,62 @@ def train(self): clean=clean_eval, ) + # Replay eval — sibling of multi_scenario_eval. Runs inline eval on + # replay scenarios (e.g. nuPlan mini train bins) with control_sdc_only + # so the policy drives the SDC while other agents follow their logged + # trajectories. Metrics logged under "validation_replay/". + if self.config["eval"].get("replay_eval") and ( + self.epoch % self.config["eval"]["eval_interval"] == 0 or done_training + ): + clean_eval = self.config["eval"].get("clean_eval", True) + replay_num_scenarios = int(self.config["eval"]["replay_num_scenarios"]) + replay_scenario_length = int(self.config["eval"]["replay_scenario_length"]) + replay_overrides = build_eval_overrides( + simulation_mode="replay", + num_agents=self.config["eval"]["num_agents"], + num_scenarios=replay_num_scenarios, + map_dir=self.config["eval"]["replay_map_dir"], + clean=clean_eval, + ) + # Override defaults that build_eval_overrides sets for WOMD replay + # (scenario_length=91) — nuPlan with duration_s=20 needs 201. 
+ replay_overrides["env"]["scenario_length"] = replay_scenario_length + replay_overrides["env"]["resample_frequency"] = replay_scenario_length + replay_overrides["env"]["control_mode"] = self.config["eval"]["replay_control_mode"] + replay_overrides["env"]["init_steps"] = int(self.config["eval"]["replay_init_steps"]) + + replay_args = load_eval_multi_scenarios_config( + env_name=self.config["env"], + model_path=None, + eval_overrides=replay_overrides, + ) + replay_args["global_step"] = self.global_step + replay_args["num_scenarios"] = replay_num_scenarios + replay_args["eval_simulation"] = "replay" + replay_args["inline_eval"] = True + experiment_name = f"{self.config['env']}_{self.logger.run_id}" + replay_args["load_model_path"] = os.path.join( + self.config["data_dir"], experiment_name, "models", f"inline_epoch_{self.epoch}.pt" + ) + replay_args["eval_results_dir"] = os.path.join( + self.config["data_dir"], + experiment_name, + "validation_replay", + f"epoch_{self.epoch}", + "replay", + ) + print(f"\n🔄 Running replay eval at step {self.global_step}...") + eval_multi_scenarios( + env_name=self.config["env"], + args=replay_args, + vecenv=None, + policy=self.uncompiled_policy, + logger=self.logger, + metric_prefix="validation_replay", + quiet=True, + clean=clean_eval, + ) + # Multi-scenario render — independent interval so the heavier render # path doesn't have to fire every eval_interval. Mirrors the block # above but calls eval_multi_scenarios_render with render=True and From 708a8293872ce476d529139431bf8cfbab4f705d Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 19 Apr 2026 19:42:14 -0400 Subject: [PATCH 04/11] render_scenario: add --no-render mode for aggregate metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When --no-render is set, calls eval_multi_scenarios across every .bin in --map-dir instead of rendering a single scenario to mp4. Produces evaluation_summary.csv in --output-dir. 
Replaces the inline dropout/perturbation-zero overrides with build_eval_overrides(clean=True) — same effect, but centralizes the clean-eval logic in one place. For replay metrics we leave offroad_behavior / collision_behavior at the eval default (=1, terminate on infraction) so the SDC is penalized per normal eval rules. The render path still forces them to 0 so the video shows the full trajectory even when the policy is far off. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/render_scenario.py | 74 ++++++++++++++++++++++++++++---------- 1 file changed, 55 insertions(+), 19 deletions(-) diff --git a/scripts/render_scenario.py b/scripts/render_scenario.py index b77d75aab..8848cb82b 100644 --- a/scripts/render_scenario.py +++ b/scripts/render_scenario.py @@ -62,6 +62,17 @@ def main(): default=None, help="Override control mode (default: control_vehicles for gigaflow, control_sdc_only for replay)", ) + parser.add_argument( + "--no-render", + action="store_true", + help="Skip video output and run eval_multi_scenarios to aggregate metrics across all bins in --map-dir. Ignores --map when set.", + ) + parser.add_argument( + "--num-scenarios", + type=int, + default=None, + help="Number of scenarios to evaluate in --no-render mode (defaults to every bin in --map-dir)", + ) cli = parser.parse_args() # Suppress argparse pollution from pufferl's load_config after our own parse @@ -89,6 +100,11 @@ def main(): # Set up a single-map directory if not provided if cli.map_dir: map_dir = os.path.abspath(cli.map_dir) + elif cli.no_render: + # No-render mode requires --map-dir explicitly (we aggregate across the + # whole directory rather than symlink a single bin). 
+ print("Error: --no-render requires --map-dir pointing at a directory of .bin files.") + sys.exit(1) else: map_dir = tempfile.mkdtemp(prefix=f"render_{cli.map}_") # Search for the map .bin in known directories @@ -116,6 +132,7 @@ def main(): from pufferlib.pufferl import ( build_eval_overrides, + eval_multi_scenarios, eval_multi_scenarios_render, load_config, load_eval_multi_scenarios_config, @@ -124,30 +141,30 @@ def main(): env_name = "puffer_drive" tmp_args = load_config(env_name) + if cli.no_render: + num_scenarios = cli.num_scenarios or len([f for f in os.listdir(map_dir) if f.endswith(".bin")]) + else: + num_scenarios = 1 + eval_overrides = build_eval_overrides( simulation_mode=cli.simulation_mode, num_agents=cli.num_eval_agents, - num_scenarios=1, + num_scenarios=num_scenarios, map_dir=map_dir, - num_carla_maps=1, + num_carla_maps=num_scenarios if cli.no_render else 1, + clean=True, ) # Override control_mode and init_steps after build_eval_overrides eval_overrides["env"]["control_mode"] = control_mode eval_overrides["env"]["init_steps"] = init_steps - # Disable robustness perturbations for eval renders — we want to see - # the policy's clean behavior, not randomly blinded or phantom-braking. - eval_overrides["env"]["partner_blindness_prob"] = 0.0 - eval_overrides["env"]["phantom_braking_prob"] = 0.0 - eval_overrides["env"]["phantom_braking_trigger_prob"] = 0.0 - # Use all road segments at eval (no dropout). The obs vector is always - # max-sized now, so dropout=0 just fills more slots without changing shape. - eval_overrides["env"]["lane_segment_dropout"] = 0.0 - eval_overrides["env"]["boundary_segment_dropout"] = 0.0 if cli.simulation_mode == "replay": - # Don't stop/remove the SDC for offroad — let it drive freely so - # the video shows the full trajectory even with a mismatched policy. 
- eval_overrides["env"]["offroad_behavior"] = 0 - eval_overrides["env"]["collision_behavior"] = 0 + # For no-render metrics we want offroad/collision to TERMINATE so + # the SDC is penalized per the normal eval rules. For rendering we + # let the SDC keep driving so the video shows the full trajectory + # even when the policy is far off. + if not cli.no_render: + eval_overrides["env"]["offroad_behavior"] = 0 + eval_overrides["env"]["collision_behavior"] = 0 # Match scenario_length to requested steps so the render loop # doesn't cap at the default 91 from build_eval_overrides. eval_overrides["env"]["scenario_length"] = steps @@ -155,11 +172,9 @@ def main(): args = load_eval_multi_scenarios_config(env_name, cli.checkpoint, eval_overrides) args["load_model_path"] = cli.checkpoint - args["num_scenarios"] = 1 - args["num_carla_maps"] = 1 + args["num_scenarios"] = num_scenarios + args["num_carla_maps"] = num_scenarios if cli.no_render else 1 args["eval_simulation"] = cli.simulation_mode - args["render"] = 1 - args["render_obs"] = 0 args["inline_eval"] = True args["eval_results_dir"] = cli.output_dir @@ -172,6 +187,27 @@ def main(): mode_desc = cli.simulation_mode if cli.simulation_mode == "replay": mode_desc += f" (control_mode={control_mode})" + + if cli.no_render: + print(f"No-render eval | mode={mode_desc} | {steps} steps | {num_scenarios} scenarios") + print(f"Map dir: {map_dir}") + print(f"Checkpoint: {cli.checkpoint}") + print(f"Output: {cli.output_dir}/") + eval_multi_scenarios( + env_name=env_name, + args=dict(args), + vecenv=None, + policy=None, + logger=None, + metric_prefix="eval", + quiet=False, + clean=True, + ) + print(f"\nDone. 
Summary CSV: {cli.output_dir}/evaluation_summary.csv") + return + + args["render"] = 1 + args["render_obs"] = 0 print(f"Rendering {cli.map} | mode={mode_desc} | {steps} steps | view={cli.view}") print(f"Checkpoint: {cli.checkpoint}") print(f"Output: {cli.output_dir}/mp4/") From f26aa19a3757186c69597fd2c0cb57ee2d8cf36a Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 19 Apr 2026 19:43:16 -0400 Subject: [PATCH 05/11] render_scenario: cap no-render workers at physical CPU count Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/render_scenario.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/scripts/render_scenario.py b/scripts/render_scenario.py index 8848cb82b..5fe70d2de 100644 --- a/scripts/render_scenario.py +++ b/scripts/render_scenario.py @@ -56,7 +56,9 @@ def main(): choices=["gigaflow", "replay"], help="Simulation mode: gigaflow (random spawn) or replay (log trajectories, policy controls SDC)", ) - parser.add_argument("--init-steps", type=int, default=None, help="Timestep to start from (default: 0 gigaflow, 10 replay)") + parser.add_argument( + "--init-steps", type=int, default=None, help="Timestep to start from (default: 0 gigaflow, 10 replay)" + ) parser.add_argument( "--control-mode", default=None, @@ -189,7 +191,16 @@ def main(): mode_desc += f" (control_mode={control_mode})" if cli.no_render: - print(f"No-render eval | mode={mode_desc} | {steps} steps | {num_scenarios} scenarios") + # Cap vec workers at physical CPU count — pufferlib rejects + # num_workers > cores (emerge2 has 16 physical cores, ini default 20). 
+ import psutil + + cpu_cores = psutil.cpu_count(logical=False) or 8 + cap = min(cpu_cores, num_scenarios) + args["vec"]["num_envs"] = cap + args["vec"]["num_workers"] = cap + args["vec"]["batch_size"] = cap + print(f"No-render eval | mode={mode_desc} | {steps} steps | {num_scenarios} scenarios | {cap} workers") print(f"Map dir: {map_dir}") print(f"Checkpoint: {cli.checkpoint}") print(f"Output: {cli.output_dir}/") From ad13b64058599d58a835c16f1e25094f78388130 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 19 Apr 2026 22:41:04 -0400 Subject: [PATCH 06/11] render.h: draw waypoints in BEV (obs_only=1) too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit draw_scene gated the waypoint draw loop on obs_only==0, so BEV view (which uses obs_only=1) had no visible goals. Drop the gate — the goal trail is the main reason to watch a BEV render. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/ocean/drive/render.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pufferlib/ocean/drive/render.h b/pufferlib/ocean/drive/render.h index 1323d39f3..c47c802bc 100644 --- a/pufferlib/ocean/drive/render.h +++ b/pufferlib/ocean/drive/render.h @@ -1215,8 +1215,10 @@ void draw_scene(Drive *env, Client *client, int mode, int obs_only, int lasers, if (!is_active_agent || agent->sim_valid == 0) { continue; } - if (!IsKeyDown(KEY_LEFT_CONTROL) && obs_only == 0) { - // Draw all target waypoints: brightest (first) to darkest (last) + if (!IsKeyDown(KEY_LEFT_CONTROL)) { + // Draw all target waypoints: brightest (first) to darkest (last). + // Drawn in BEV too (obs_only=1) since the goal trail is the main + // thing you want to see when labeling scenarios. 
int num_wp = env->num_target_waypoints; if (num_wp > MAX_TARGET_WAYPOINTS) num_wp = MAX_TARGET_WAYPOINTS; From dbecd2597c00c3e50903220edc627d81d79a35c1 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Sun, 19 Apr 2026 23:01:55 -0400 Subject: [PATCH 07/11] Clean eval: bump max_partner_observations to 32, swap on policy too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends _swap_policy_obs_counts to also swap max_partner_observations and max_traffic_control_observations — they're both shared-MLP + max-pool encoders, so swapping the count on the live training policy is safe and lets the policy consume a wider obs buffer at eval. build_eval_overrides(clean=True) now sets max_partner_observations=32 (training default 16). In BEV render the extra partner observations show up as more visible vehicles, matching the clean lane behavior. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/pufferl.py | 55 +++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 9b5e81f06..52ca3f671 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -1973,11 +1973,14 @@ def build_eval_overrides(simulation_mode, num_agents, num_scenarios, map_dir=Non if clean: # Dropout changes the obs shape. Only safe when the policy is - # rebuilt from the eval env (standalone eval / render_scenario). - # NEVER pass clean=True from an inline-eval call site — the live - # training policy's encoder was built for the training obs shape. + # rebuilt from the eval env (standalone eval / render_scenario), + # OR when the inline caller uses _swap_policy_obs_counts. common_env["lane_segment_dropout"] = 0.0 common_env["boundary_segment_dropout"] = 0.0 + # Clean eval gets a bigger partner budget too — mirrors the lane + # story. 
The partner encoder is shared-MLP + max-pool, so the same + # policy weights handle the larger count via _swap_policy_obs_counts. + common_env["max_partner_observations"] = 32 if simulation_mode == "gigaflow": eval_overrides = { @@ -2019,44 +2022,48 @@ def build_eval_overrides(simulation_mode, num_agents, num_scenarios, map_dir=Non return eval_overrides +_SWAPPABLE_OBS_COUNTS = ( + "obs_lane_segment_count", + "obs_boundary_segment_count", + "max_partner_observations", + "max_traffic_control_observations", +) + + @contextlib.contextmanager def _swap_policy_obs_counts(policy, vecenv): - """Temporarily align the policy's road-segment slicing with the eval env. + """Temporarily align the policy's obs slicing with the eval env. - Training uses dropout > 0 → smaller obs_{lane,boundary}_segment_count. - Clean eval uses dropout = 0 → larger counts, larger obs buffer. The - GigaFlow encoder (lane_encoder / boundary_encoder) is a shared MLP - applied per-segment with max-pool — its weights are count-invariant. + Training may use dropout > 0 or tighter partner/traffic-control caps, + giving smaller counts. Clean eval (or an eval with --max-partner-observations + bumped) produces a larger obs buffer. The GigaFlow encoders for lanes, + boundaries, partners, and traffic controls are all shared MLPs with + max-pool over their element dimension — weights are count-invariant. Only the obs-buffer slicing in DriveBackbone.forward depends on these - counts, so we can just swap them for the duration of the eval and the - same training policy works on the larger clean obs. + counts, so we swap them for the duration of the eval and the same + training policy works on the larger clean obs. 
""" try: eval_env = vecenv.driver_env - new_lane = int(eval_env.obs_lane_segment_count) - new_boundary = int(eval_env.obs_boundary_segment_count) + new_counts = {name: int(getattr(eval_env, name)) for name in _SWAPPABLE_OBS_COUNTS} except AttributeError: # If the eval env doesn't expose these (unknown wrapper), skip the # swap — forward will still work when training and eval obs shapes - # coincide (clean=False or no dropout configured). + # coincide (clean=False or no overrides). yield return - targets = [] - for m in policy.modules(): - if hasattr(m, "obs_lane_segment_count") and hasattr(m, "obs_boundary_segment_count"): - targets.append(m) - - saved = [(m.obs_lane_segment_count, m.obs_boundary_segment_count) for m in targets] + targets = [m for m in policy.modules() if all(hasattr(m, n) for n in _SWAPPABLE_OBS_COUNTS)] + saved = [{n: getattr(m, n) for n in _SWAPPABLE_OBS_COUNTS} for m in targets] try: for m in targets: - m.obs_lane_segment_count = new_lane - m.obs_boundary_segment_count = new_boundary + for n, v in new_counts.items(): + setattr(m, n, v) yield finally: - for m, (orig_lane, orig_boundary) in zip(targets, saved): - m.obs_lane_segment_count = orig_lane - m.obs_boundary_segment_count = orig_boundary + for m, orig in zip(targets, saved): + for n, v in orig.items(): + setattr(m, n, v) def verify_scenario_coverage(csv_path: str, num_scenarios: int) -> dict: From f1479cffe4dc234135040b7cb09339bbfd8af099 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Mon, 20 Apr 2026 08:04:41 -0400 Subject: [PATCH 08/11] driving_behaviours_eval: force clean-eval overrides + nuPlan categories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit utils.py: run_driving_behaviours_eval_in_subprocess now passes eval_mode=1, traffic_light_behavior=1, zero dropout, zero perturbations, and max_partner_observations=32 — matches build_eval_overrides(clean=True). 
Previously the subprocess re-parsed drive.ini and inherited whatever defaults were there, so eval_mode stayed 0 (randomized TL cycle) and training-time CLI overrides quietly dropped. driving_behaviours_eval.ini: rebuilt around the nuPlan mini-train bins labeled under /scratch/ev2237/data/nuplan/categories/{category}/. Eleven sections (hard_stop, highway_straight, lane_change, merge, parked_cars, roundabout, stopped_traffic, traffic_light_{green,stop}, unprotected_{left,right}). Scenario length 201 for nuPlan duration_s=20. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../config/ocean/driving_behaviours_eval.ini | 70 +++++++++++++------ pufferlib/utils.py | 21 ++++++ 2 files changed, 70 insertions(+), 21 deletions(-) diff --git a/pufferlib/config/ocean/driving_behaviours_eval.ini b/pufferlib/config/ocean/driving_behaviours_eval.ini index 0951ad1b8..e82718c90 100644 --- a/pufferlib/config/ocean/driving_behaviours_eval.ini +++ b/pufferlib/config/ocean/driving_behaviours_eval.ini @@ -1,28 +1,56 @@ -; Configuration for driving behaviour evaluation using human-replay (WOMD) scenarios. -; Evaluates 5 broad driving behaviour classes: -; lead vehicle interaction (longitudinal), lane change (lateral), dense traffic, -; obstacles, vulnerable road user interactions (VRUs). -; Each class section sets the map_dir. num_agents is set automatically from the -; number of .bin files in the directory. +; Configuration for driving behaviour evaluation using nuPlan mini-train +; scenarios labeled by scene type. Each section points at a folder of +; .bin files under /scratch/ev2237/data/nuplan/categories/{category}/. +; ; Eval runs in REPLAY mode (simulation_mode=replay, control_mode=control_sdc_only) -; using the same reward weights as training (no reward conditioning). +; using the same reward weights as training (no reward conditioning). Scenario +; length is 201 (nuPlan with duration_s=20 at 10Hz → 20.1s). 
+; +; Categories with an empty folder are omitted — driving_behaviours_eval errors +; if map_dir has no .bin files. Add new categories by labeling more scenes +; (see scripts/render_scenario.py --view bev) and copying them into the +; corresponding /scratch/ev2237/data/nuplan/categories// folder. -[eval_lead_vehicle_interaction] -map_dir = "pufferlib/resources/drive/binaries/longitudinal" -scenario_length = 91 +[eval_hard_stop] +map_dir = "/scratch/ev2237/data/nuplan/categories/hard_stop" +scenario_length = 201 + +[eval_highway_straight] +map_dir = "/scratch/ev2237/data/nuplan/categories/highway_straight" +scenario_length = 201 [eval_lane_change] -map_dir = "pufferlib/resources/drive/binaries/lateral" -scenario_length = 91 +map_dir = "/scratch/ev2237/data/nuplan/categories/lane_change" +scenario_length = 201 + +[eval_merge] +map_dir = "/scratch/ev2237/data/nuplan/categories/merge" +scenario_length = 201 + +[eval_parked_cars] +map_dir = "/scratch/ev2237/data/nuplan/categories/parked_cars" +scenario_length = 201 + +[eval_roundabout] +map_dir = "/scratch/ev2237/data/nuplan/categories/roundabout" +scenario_length = 201 + +[eval_stopped_traffic] +map_dir = "/scratch/ev2237/data/nuplan/categories/stopped_traffic" +scenario_length = 201 + +[eval_traffic_light_green] +map_dir = "/scratch/ev2237/data/nuplan/categories/traffic_light_green" +scenario_length = 201 -[eval_dense_traffic] -map_dir = "pufferlib/resources/drive/binaries/dense" -scenario_length = 91 +[eval_traffic_light_stop] +map_dir = "/scratch/ev2237/data/nuplan/categories/traffic_light_stop" +scenario_length = 201 -[eval_obstacles] -map_dir = "pufferlib/resources/drive/binaries/obstacles" -scenario_length = 91 +[eval_unprotected_left] +map_dir = "/scratch/ev2237/data/nuplan/categories/unprotected_left" +scenario_length = 201 -[eval_vru_interaction] -map_dir = "pufferlib/resources/drive/binaries/vru" -scenario_length = 91 +[eval_unprotected_right] +map_dir = 
"/scratch/ev2237/data/nuplan/categories/unprotected_right" +scenario_length = 201 diff --git a/pufferlib/utils.py b/pufferlib/utils.py index bb4f10b1d..dc6367e78 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -231,6 +231,27 @@ def run_driving_behaviours_eval_in_subprocess(config, logger, global_step, behav "create_all_valid", "--env.scenario-length", str(scenario_length), + # Clean-eval overrides. Mirrors build_eval_overrides(clean=True): + # deterministic TL cycle (eval_mode=1), red lights enforced, + # no road-segment dropout, no partner blindness or phantom + # braking, wider partner budget. Subprocess re-parses the ini + # so training-time CLI overrides don't leak in here. + "--env.eval-mode", + "1", + "--env.traffic-light-behavior", + "1", + "--env.lane-segment-dropout", + "0.0", + "--env.boundary-segment-dropout", + "0.0", + "--env.partner-blindness-prob", + "0.0", + "--env.phantom-braking-prob", + "0.0", + "--env.phantom-braking-trigger-prob", + "0.0", + "--env.max-partner-observations", + "32", ] print(f"DrivingBehavioursEval: running class '{short}' with map_dir={map_dir}") From 85df54b461ca5e6b2b62d88ada9c4b368bbb116f Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Mon, 20 Apr 2026 08:14:22 -0400 Subject: [PATCH 09/11] Pin dt=0.1 for all eval paths Training can override dt for curriculum/speed experiments; eval needs to stay at 10Hz so replay-env simulation matches the logged trajectory sample rate. Otherwise waypoints drift against the SDC's actual path. - build_eval_overrides (inline + standalone + render_scenario): dt=0.1 added to common_env so it flows through regardless of clean mode. - run_driving_behaviours_eval_in_subprocess: --env.dt 0.1 added to the subprocess cmd so it overrides whatever drive.ini default / CLI override the parent process had. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/pufferl.py | 5 +++++ pufferlib/utils.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index 52ca3f671..aa208ca2b 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -1949,6 +1949,11 @@ def build_eval_overrides(simulation_mode, num_agents, num_scenarios, map_dir=Non # Common reward coefficients (same for both modes) common_env = { "eval_mode": 1, + # Eval always runs at dt=0.1 (10Hz), the rate the logged trajectories + # were sampled at. Training may override dt for curriculum / speed + # experiments, but eval metrics need the logged-rate physics or + # replay positions drift against the real waypoints. + "dt": 0.1, "collision_behavior": 1, "offroad_behavior": 1, "traffic_light_behavior": 1 if clean else 0, diff --git a/pufferlib/utils.py b/pufferlib/utils.py index dc6367e78..cd27a36ac 100644 --- a/pufferlib/utils.py +++ b/pufferlib/utils.py @@ -231,6 +231,11 @@ def run_driving_behaviours_eval_in_subprocess(config, logger, global_step, behav "create_all_valid", "--env.scenario-length", str(scenario_length), + # Pin dt to 0.1 (10Hz). Training may override dt for curriculum + # / speed experiments, but eval metrics need the logged-rate + # physics or replay positions drift against the real waypoints. + "--env.dt", + "0.1", # Clean-eval overrides. Mirrors build_eval_overrides(clean=True): # deterministic TL cycle (eval_mode=1), red lights enforced, # no road-segment dropout, no partner blindness or phantom From 02e9c96565489af349fed7065ec862a8437814e4 Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Mon, 20 Apr 2026 22:19:14 -0400 Subject: [PATCH 10/11] Render path: honor clean_eval + score >=3 instead of >=4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pufferl.py: - eval_multi_scenarios_render takes a clean= kwarg, wraps the rollout loop with _swap_policy_obs_counts when set. 
Standalone entry now reads eval.clean_eval from the config. - _render_driving_behaviours builds overrides with clean=True and passes clean=True to eval_multi_scenarios_render. Matches the metric-eval subprocess so the mp4s reflect the same clean conditions the wandb scalars do (no more flashing BEVs from inherited dropout). - _train multi_scenario_render block: same — reads eval.clean_eval, plumbs to build_eval_overrides + eval_multi_scenarios_render. drive.h: - add_log score threshold was hardcoded >=4, but num_target_waypoints=3 caps num_goals_reached at 3, so score was always 0. Changed to >=3. Removes the TODO/FIXME comments. Co-Authored-By: Claude Opus 4.7 (1M context) --- pufferlib/ocean/drive/drive.h | 7 ++++--- pufferlib/pufferl.py | 19 ++++++++++++++++++- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h index af51ec260..5e56efbb2 100644 --- a/pufferlib/ocean/drive/drive.h +++ b/pufferlib/ocean/drive/drive.h @@ -2599,9 +2599,10 @@ static void add_log(Drive *env) { env->log.num_waypoints_reached += num_waypoints_reached; int num_goals_reached = env->logs[i].num_goals_reached; env->log.num_goals_reached += num_goals_reached; - // TODO: define better scoring criteria ? - // FIXME - if (num_goals_reached >= 4 && !agent->removed && !agent->stopped) { + // Score: 1 per agent that reached all 3 target waypoints without + // being removed/stopped. Was hardcoded to >=4, unreachable given + // num_target_waypoints=3 in the ini, so score was always 0.
+ if (num_goals_reached >= 3 && !agent->removed && !agent->stopped) { env->log.score += 1.0f; } if (!offroad && !collided && !red_light_violations && num_waypoints_reached < 1) { diff --git a/pufferlib/pufferl.py b/pufferlib/pufferl.py index aa208ca2b..a0a3d5414 100644 --- a/pufferlib/pufferl.py +++ b/pufferlib/pufferl.py @@ -623,6 +623,7 @@ def train(self): render_simulation_mode = self.config["eval"]["multi_scenario_simulation_mode"] num_agents_render = self.config["eval"]["num_agents"] render_map_dir = self.config["eval"]["map_dir"] + clean_render = self.config["eval"].get("clean_eval", True) render_overrides = build_eval_overrides( simulation_mode=render_simulation_mode, @@ -630,6 +631,7 @@ def train(self): num_scenarios=self.config["eval"]["multi_scenario_num_scenarios"], map_dir=render_map_dir, num_carla_maps=self.config["eval"].get("num_carla_maps", 8), + clean=clean_render, ) render_args = load_eval_multi_scenarios_config( @@ -690,6 +692,7 @@ def train(self): # the mystery ~500-c_render-call abort is properly diagnosed. # Set to 0/negative to disable the cap entirely. render_max_steps=(self.config["eval"].get("render_max_steps", 50) or None), + clean=clean_render, ) except Exception as e: import traceback @@ -1027,11 +1030,17 @@ def _render_driving_behaviours(self, behaviours_config): short = class_name[len(EVAL_SECTIONS_PREFIX) :] num_maps = len([f for f in os.listdir(map_dir) if f.endswith(".bin")]) + # Render under clean-eval conditions (zero dropout, zero + # perturbations, enforced red lights) so the mp4s show what + # the policy does under controlled eval, not the noisy + # training-time perturbations. Matches run_driving_behaviours + # _eval_in_subprocess, so the video matches the metric eval. 
render_overrides = build_eval_overrides( simulation_mode="replay", num_agents=1, num_scenarios=1, map_dir=map_dir, + clean=True, ) render_overrides["env"]["control_mode"] = "control_sdc_only" render_overrides["env"]["num_maps"] = num_maps @@ -1073,6 +1082,7 @@ def _render_driving_behaviours(self, behaviours_config): video_suffix=vsuffix, log_view_label=vlabel, render_max_steps=(self.config["eval"].get("render_max_steps", 50) or None), + clean=True, ) except Exception as e: import traceback @@ -2398,6 +2408,7 @@ def eval_multi_scenarios_render( log_view_label="render", render_max_steps=None, render_key_prefix=None, + clean=False, ): # Set fixed seed for reproducible evaluation np.random.seed(42) @@ -2408,14 +2419,17 @@ def eval_multi_scenarios_render( model_path = tmp_args.get("load_model_path") num_agents_eval = tmp_args["eval"]["num_agents"] map_dir = tmp_args["eval"]["map_dir"] + clean_from_config = tmp_args["eval"].get("clean_eval", False) eval_overrides = build_eval_overrides( simulation_mode=tmp_args["eval_simulation"], num_agents=num_agents_eval, num_scenarios=tmp_args["num_scenarios"], map_dir=map_dir, num_carla_maps=tmp_args.get("num_carla_maps", 8), + clean=clean_from_config, ) args = load_eval_multi_scenarios_config(env_name, model_path, eval_overrides) + clean = clean or clean_from_config backend = args["vec"]["backend"] if backend != "PufferEnv": @@ -2511,7 +2525,10 @@ def eval_multi_scenarios_render( # Serial/Multiprocessing: need vecenv.envs[0] to reach the underlying env. target_env = vecenv if not hasattr(vecenv, "envs") else vecenv.envs[0] - with tqdm(total=num_scenarios, desc="Processing scenarios", disable=quiet) as pbar: + # Align the live training policy's obs slicing with the (potentially + # clean) eval env for the render. Same swap as eval_multi_scenarios. 
+ swap_ctx = _swap_policy_obs_counts(policy, vecenv) if clean else contextlib.nullcontext() + with swap_ctx, tqdm(total=num_scenarios, desc="Processing scenarios", disable=quiet) as pbar: while scenarios_processed < num_scenarios: ob, _ = vecenv.reset() From ff4e0df4f0a7d50be208fa1781ae46ad8f2a01fb Mon Sep 17 00:00:00 2001 From: Eugene Vinitsky Date: Tue, 21 Apr 2026 23:29:05 -0400 Subject: [PATCH 11/11] driving_behaviours_eval.ini: point at categories_v021 (0.2.1 reconvert) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../config/ocean/driving_behaviours_eval.ini | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pufferlib/config/ocean/driving_behaviours_eval.ini b/pufferlib/config/ocean/driving_behaviours_eval.ini index e82718c90..651fc27c0 100644 --- a/pufferlib/config/ocean/driving_behaviours_eval.ini +++ b/pufferlib/config/ocean/driving_behaviours_eval.ini @@ -1,6 +1,6 @@ ; Configuration for driving behaviour evaluation using nuPlan mini-train -; scenarios labeled by scene type. Each section points at a folder of -; .bin files under /scratch/ev2237/data/nuplan/categories/. +; scenarios labeled by scene type. Built from py123d 0.2.1 reconvert of +; .bin files under /scratch/ev2237/data/nuplan/categories_v021/. ; ; Eval runs in REPLAY mode (simulation_mode=replay, control_mode=control_sdc_only) ; using the same reward weights as training (no reward conditioning). Scenario @@ -9,48 +9,48 @@ ; Categories with an empty folder are omitted — driving_behaviours_eval errors ; if map_dir has no .bin files. Add new categories by labeling more scenes ; (see scripts/render_scenario.py --view bev) and copying them into the -; corresponding /scratch/ev2237/data/nuplan/categories// folder. +; corresponding /scratch/ev2237/data/nuplan/categories_v021// folder. 
[eval_hard_stop] -map_dir = "/scratch/ev2237/data/nuplan/categories/hard_stop" +map_dir = "/scratch/ev2237/data/nuplan/categories_v021/hard_stop" scenario_length = 201 [eval_highway_straight] -map_dir = "/scratch/ev2237/data/nuplan/categories/highway_straight" +map_dir = "/scratch/ev2237/data/nuplan/categories_v021/highway_straight" scenario_length = 201 [eval_lane_change] -map_dir = "/scratch/ev2237/data/nuplan/categories/lane_change" +map_dir = "/scratch/ev2237/data/nuplan/categories_v021/lane_change" scenario_length = 201 [eval_merge] -map_dir = "/scratch/ev2237/data/nuplan/categories/merge" +map_dir = "/scratch/ev2237/data/nuplan/categories_v021/merge" scenario_length = 201 [eval_parked_cars] -map_dir = "/scratch/ev2237/data/nuplan/categories/parked_cars" +map_dir = "/scratch/ev2237/data/nuplan/categories_v021/parked_cars" scenario_length = 201 [eval_roundabout] -map_dir = "/scratch/ev2237/data/nuplan/categories/roundabout" +map_dir = "/scratch/ev2237/data/nuplan/categories_v021/roundabout" scenario_length = 201 [eval_stopped_traffic] -map_dir = "/scratch/ev2237/data/nuplan/categories/stopped_traffic" +map_dir = "/scratch/ev2237/data/nuplan/categories_v021/stopped_traffic" scenario_length = 201 [eval_traffic_light_green] -map_dir = "/scratch/ev2237/data/nuplan/categories/traffic_light_green" +map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_green" scenario_length = 201 [eval_traffic_light_stop] -map_dir = "/scratch/ev2237/data/nuplan/categories/traffic_light_stop" +map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_stop" scenario_length = 201 [eval_unprotected_left] -map_dir = "/scratch/ev2237/data/nuplan/categories/unprotected_left" +map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_left" scenario_length = 201 [eval_unprotected_right] -map_dir = "/scratch/ev2237/data/nuplan/categories/unprotected_right" +map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_right" scenario_length = 201