Emerge-Lab · eugenevinitsky · Apr 19, 2026 · Apr 19, 2026 · Apr 19, 2026 · Apr 19, 2026
diff --git a/pufferlib/config/ocean/drive.ini b/pufferlib/config/ocean/drive.ini
@@ -206,6 +206,12 @@ multi_scenario_render_interval = 250
 multi_scenario_render_backend = egl
 ; Frequency of evaluation during training (in epochs)
 eval_interval = 25
+; When True, inline eval zeroes road-segment dropout + perturbations and
+; enforces red-light stops. Metrics then reflect performance under clean
+; conditions. The live training policy is re-aligned to the clean env's
+; obs shape via _swap_policy_obs_counts — safe because the GigaFlow
+; encoder is count-invariant (shared MLP + max-pool over segments).
+clean_eval = True
 num_agents = 512
 ; Batch size for eval_multi_scenarios (number of scenarios per batch)
 ; Path to dataset used for evaluation
@@ -221,6 +227,21 @@ multi_scenario_num_scenarios = 250
 ; Cap the render rollout at this many steps.
 render_max_steps = 200
 backend = PufferEnv
+; --- Replay eval (sibling of multi_scenario_eval) ---
+; When True, fires a second inline eval on replay scenarios (nuPlan, WOMD)
+; at the same cadence as eval_interval. It runs alongside the gigaflow
+; multi_scenario_eval and logs under metric_prefix "validation_replay".
+; Honors clean_eval the same way the gigaflow eval does.
+replay_eval = False
+replay_map_dir = "/scratch/ev2237/data/nuplan/nuplan_mini_train_bins"
+replay_num_scenarios = 16
+; Full trajectory length in the replay bins. nuPlan mini with duration_s=20
+; yields 201 steps (20.1s @ 10Hz); WOMD bins are 91.
+replay_scenario_length = 201
+replay_control_mode = "control_sdc_only"
+; Timestep to start replay from. Leaves a short pre-roll so agents have
+; logged state; policy takes over at this step.
+replay_init_steps = 10
 ; WOSAC (Waymo Open Sim Agents Challenge) evaluation settings
 ; If True, enables evaluation on realism metrics each time we save a checkpoint
 wosac_realism_eval = False

diff --git a/pufferlib/config/ocean/driving_behaviours_eval.ini b/pufferlib/config/ocean/driving_behaviours_eval.ini
@@ -1,28 +1,56 @@
-; Configuration for driving behaviour evaluation using human-replay (WOMD) scenarios.
-; Evaluates 5 broad driving behaviour classes:
-;   lead vehicle interaction (longitudinal), lane change (lateral), dense traffic,
-;   obstacles, vulnerable road user interactions (VRUs).
-; Each class section sets the map_dir. num_agents is set automatically from the
-; number of .bin files in the directory.
+; Configuration for driving behaviour evaluation using nuPlan mini-train
+; scenarios labeled by scene type. The .bin files are a py123d 0.2.1
+; reconversion stored under /scratch/ev2237/data/nuplan/categories_v021/<class>.
+;
 ; Eval runs in REPLAY mode (simulation_mode=replay, control_mode=control_sdc_only)
-; using the same reward weights as training (no reward conditioning).
+; using the same reward weights as training (no reward conditioning). Scenario
+; length is 201 (nuPlan with duration_s=20 at 10Hz → 20.1s).
+;
+; Categories with an empty folder are omitted — driving_behaviours_eval errors
+; if map_dir has no .bin files. Add new categories by labeling more scenes
+; (see scripts/render_scenario.py --view bev) and copying them into the
+; corresponding /scratch/ev2237/data/nuplan/categories_v021/<class>/ folder.
 
-[eval_lead_vehicle_interaction]
-map_dir = "pufferlib/resources/drive/binaries/longitudinal"
-scenario_length = 91
+[eval_hard_stop]
+map_dir = "/scratch/ev2237/data/nuplan/categories_v021/hard_stop"
+scenario_length = 201
+
+[eval_highway_straight]
+map_dir = "/scratch/ev2237/data/nuplan/categories_v021/highway_straight"
+scenario_length = 201
 
 [eval_lane_change]
-map_dir = "pufferlib/resources/drive/binaries/lateral"
-scenario_length = 91
+map_dir = "/scratch/ev2237/data/nuplan/categories_v021/lane_change"
+scenario_length = 201
+
+[eval_merge]
+map_dir = "/scratch/ev2237/data/nuplan/categories_v021/merge"
+scenario_length = 201
+
+[eval_parked_cars]
+map_dir = "/scratch/ev2237/data/nuplan/categories_v021/parked_cars"
+scenario_length = 201
+
+[eval_roundabout]
+map_dir = "/scratch/ev2237/data/nuplan/categories_v021/roundabout"
+scenario_length = 201
+
+[eval_stopped_traffic]
+map_dir = "/scratch/ev2237/data/nuplan/categories_v021/stopped_traffic"
+scenario_length = 201
+
+[eval_traffic_light_green]
+map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_green"
+scenario_length = 201
 
-[eval_dense_traffic]
-map_dir = "pufferlib/resources/drive/binaries/dense"
-scenario_length = 91
+[eval_traffic_light_stop]
+map_dir = "/scratch/ev2237/data/nuplan/categories_v021/traffic_light_stop"
+scenario_length = 201
 
-[eval_obstacles]
-map_dir = "pufferlib/resources/drive/binaries/obstacles"
-scenario_length = 91
+[eval_unprotected_left]
+map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_left"
+scenario_length = 201
 
-[eval_vru_interaction]
-map_dir = "pufferlib/resources/drive/binaries/vru"
-scenario_length = 91
+[eval_unprotected_right]
+map_dir = "/scratch/ev2237/data/nuplan/categories_v021/unprotected_right"
+scenario_length = 201
diff --git a/pufferlib/ocean/drive/drive.h b/pufferlib/ocean/drive/drive.h
@@ -2599,9 +2599,10 @@ static void add_log(Drive *env) {
         env->log.num_waypoints_reached += num_waypoints_reached;
         int num_goals_reached = env->logs[i].num_goals_reached;
         env->log.num_goals_reached += num_goals_reached;
-        // TODO: define better scoring criteria ?
-        // FIXME
-        if (num_goals_reached >= 4 && !agent->removed && !agent->stopped) {
+        // Score: 1 per agent that reached all 3 target waypoints without being
+        // removed/stopped. The old >=4 threshold was unreachable (score always
+        // 0) with num_target_waypoints=3 in the ini; keep this literal in sync.
+        if (num_goals_reached >= 3 && !agent->removed && !agent->stopped) {
             env->log.score += 1.0f;
         }
         if (!offroad && !collided && !red_light_violations && num_waypoints_reached < 1) {

diff --git a/pufferlib/ocean/drive/render.h b/pufferlib/ocean/drive/render.h
@@ -1215,8 +1215,10 @@ void draw_scene(Drive *env, Client *client, int mode, int obs_only, int lasers,
         if (!is_active_agent || agent->sim_valid == 0) {
             continue;
         }
-        if (!IsKeyDown(KEY_LEFT_CONTROL) && obs_only == 0) {
-            // Draw all target waypoints: brightest (first) to darkest (last)
+        if (!IsKeyDown(KEY_LEFT_CONTROL)) {
+            // Draw all target waypoints: brightest (first) to darkest (last).
+            // Drawn in BEV too (obs_only=1) since the goal trail is the main
+            // thing you want to see when labeling scenarios.
             int num_wp = env->num_target_waypoints;
             if (num_wp > MAX_TARGET_WAYPOINTS)
                 num_wp = MAX_TARGET_WAYPOINTS;