From 59343aa7d4f5bbf54d64f139fe535c03e4b652fe Mon Sep 17 00:00:00 2001
From: Joshua Bloom <joshuabloom@mac.mynetworksettings.com>
Date: Mon, 13 Apr 2026 13:46:00 -0400
Subject: [PATCH] feat(gsp): filter replay-buffer stores by per-robot force
 magnitude
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds GSP_STORE_FORCE_THRESHOLD config knob. When set, Main.py only stores
a GSP transition for robot i if stats[i][0] (force_magnitude) exceeds the
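threshold. Default 0.0 is meant to preserve legacy behavior (all transitions
with prox activity get stored); note that the guard is a strict `>`, so a
sample whose force_magnitude is exactly 0.0 is still skipped at the default.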

Why: the live 6-config DDQN variant batch showed that every GSP variant
(plain GSP, GSP-B, GSP-N, R-GSP-N, A-GSP-N) converges to zero correlation
with the delta-theta label within 100-250 episodes. A direct linear-R²
diagnostic on the captured HDF5 data showed why:

  full distribution    → linear R² ceiling = 7.2%
  force_magnitude >p75 → linear R² ceiling = 24.6%
  force_magnitude >p90 → linear R² ceiling = 27.4%
  force_magnitude >p95 → linear R² ceiling = 30.0%

The 90% of timesteps with near-zero force contain near-zero signal and
drown out the 10% of informative samples in the replay buffer. Even a
perfect supervised regressor can't push its MSE much below var(label)
(i.e. R² stays near zero) when 90% of training samples are
(uninformative state, noise-dominated label) pairs.

Filtering concentrates the training distribution on the timesteps where
robots are actually interacting with obstacles and the payload is
actually rotating in response — a ~3.4× (at p75) to ~4.2× (at p95) lift
in the linear R² ceiling, per the table above.

Implementation notes:
- Filter applies uniformly to all GSP variants (plain, GSP-B, GSP-N,
  R-GSP-N, A-GSP-N). The existing prox-activity guards are preserved
  and ANDed with the new force guard.
- Default threshold 0.0 means the filter is disabled unless explicitly
  enabled via config. Recommended starting value: 4.0 (≈ p75 of
  force_magnitude in 2-obstacle runs).
- stats[i][0] is per-robot force magnitude, already parsed from ZMQ
  at the top of the episode step loop — no extra I/O cost.

Companion: the Stelaris launcher will need matching wiring so experiments
can set GSP_STORE_FORCE_THRESHOLD via the matrix YAML.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 rl_code/Main.py             | 40 ++++++++++++++-----------------------
 run_baseline_experiments.py |  9 +++++++++
 2 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/rl_code/Main.py b/rl_code/Main.py
index aefba90..e851655 100644
--- a/rl_code/Main.py
+++ b/rl_code/Main.py
@@ -378,51 +378,41 @@
                         # print("-------------------------------------------------")
                         # print('[GSP]', next_heading_gsp)
 
-                        # Store GSP Transition
+                        # Store GSP Transition — guard by per-robot force magnitude.
+                        # GSP_STORE_FORCE_THRESHOLD concentrates training on samples where
+                        # the robot is actively applying force (top ~25% of samples at
+                        # threshold ~4.0), which multiplies the linear-R² ceiling of the
+                        # prediction problem 3–4× (see
+                        # docs/research/2026-04-13-gsp-ddpg-vs-attention-collapse.md).
+                        # 0.0 = legacy behavior, except force_magnitude == 0.0 is skipped (strict >).
+                        force_thr = float(config.get('GSP_STORE_FORCE_THRESHOLD', 0.0))
                         if model.gsp_neighbors:
                             states, state_prox_flags = model.make_gsp_states(old_agent_prox_flags, neighbors_old_heading_gsp, True)
                             new_states = model.make_gsp_states(agent_prox_flags, old_heading_gsp)
                             for i in range(Utility.params['num_robots']):
-                                if np.sum(state_prox_flags[i]) > 0:
+                                if np.sum(state_prox_flags[i]) > 0 and stats[i][0] > force_thr:
                                     if model.gsp_networks['learning_scheme'] == 'attention':
                                         model.store_gsp_transition(states[i], label, 0, 0, 0)
                                     else:
-                                        # 2nd arg = label (supervised target for direct-MSE GSP training)
                                         state = states[i]
                                         new_state = new_states[i]
                                         model.store_gsp_transition(state, label, 0, new_state, 0)
                         elif model.gsp_broadcast:
-                            # GSP-B per-agent storage with broadcast inputs.
-                            # state_t : broadcast view at previous step (uses neighbors_old_heading_gsp so
-                            #            the prev_gsp slot reflects the prediction from the previous tick)
-                            # state_{t+1}: broadcast view at current step
                             states = model.make_gsp_states_broadcast(old_agent_prox_flags, neighbors_old_heading_gsp)
                             new_states = model.make_gsp_states_broadcast(agent_prox_flags, old_heading_gsp)
                             for i in range(Utility.params['num_robots']):
-                                # Gate on self-prox being non-zero so we only store informative transitions,
-                                # matching the GSP and GSP-N branches. Self-prox lives at index 0 under the
-                                # self-first layout.
-                                if states[i][0] != 0:
+                                if states[i][0] != 0 and stats[i][0] > force_thr:
                                     model.store_gsp_transition(states[i], label, 0, new_states[i], 0)
                         else:
                             for i in range(Utility.params['num_robots']):
-                                if model.gsp_networks['learning_scheme'] == 'attention':
-                                    state = np.array(old_agent_prox_flags)
-                                    # only store the state if it has value
-                                    if np.sum(state) > 0:
+                                state = np.array(old_agent_prox_flags)
+                                if np.sum(state) > 0 and stats[i][0] > force_thr:
+                                    if model.gsp_networks['learning_scheme'] == 'attention':
                                         model.store_gsp_transition(state, label, 0, 0, 0)
-                                elif args.independent_learning:
-                                    state = np.array(old_agent_prox_flags)
-                                    # only store the state if it has value
-                                    if np.sum(state) > 0:
-                                        # 2nd arg = label (supervised target for direct-MSE GSP training)
+                                    elif args.independent_learning:
                                         new_state = np.array(agent_prox_flags)
                                         models[i].store_gsp_transition(state, label, 0, new_state, 0)
-                                else:
-                                    state = np.array(old_agent_prox_flags)
-                                    # only store the state if it has value
-                                    if np.sum(state) > 0:
-                                        # 2nd arg = label (supervised target for direct-MSE GSP training)
+                                    else:
                                         new_state = np.array(agent_prox_flags)
                                         model.store_gsp_transition(state, label, 0, new_state, 0)
 
diff --git a/run_baseline_experiments.py b/run_baseline_experiments.py
index e8bc48c..84ea9b4 100644
--- a/run_baseline_experiments.py
+++ b/run_baseline_experiments.py
@@ -162,6 +162,15 @@ def make_config(exp_name, gsp, neighbors, num_obstacles, use_gate, gate_curricul
         "GSP_LEARNING_FREQUENCY": 4,
         "LEARN_EVERY": 4,
         "GSP_BATCH_SIZE": 256,
+        # Per-robot force_magnitude threshold for GSP replay buffer store filter.
+        # 0.0 = legacy default (store transitions with prox activity); strict > still skips force == 0.0.
+        # > 0 = only store transitions where stats[i][0] (force_magnitude) exceeds
+        # the threshold. This concentrates GSP training on samples where the robot
+        # is actively applying force, which empirically multiplies the linear-R²
+        # ceiling of the prediction problem 3–4× (see
+        # docs/research/2026-04-13-gsp-ddpg-vs-attention-collapse.md in Stelaris).
+        # Recommended starting point: ~4.0 (≈ p75 of force_magnitude in 2-obstacle runs).
+        "GSP_STORE_FORCE_THRESHOLD": 0.0,
     }