From 59343aa7d4f5bbf54d64f139fe535c03e4b652fe Mon Sep 17 00:00:00 2001 From: Joshua Bloom Date: Mon, 13 Apr 2026 13:46:00 -0400 Subject: [PATCH] feat(gsp): filter replay-buffer stores by per-robot force magnitude MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds GSP_STORE_FORCE_THRESHOLD config knob. When set, Main.py only stores a GSP transition for robot i if stats[i][0] (force_magnitude) exceeds the threshold. Default 0.0 preserves legacy behavior (all transitions with prox activity get stored). Why: the live 6-config DDQN variant batch showed that every GSP variant (plain GSP, GSP-B, GSP-N, R-GSP-N, A-GSP-N) converges to zero correlation with the delta-theta label within 100-250 episodes. A direct linear-R² diagnostic on the captured HDF5 data showed why: full distribution → linear R² ceiling = 7.2% force_magnitude >p75 → linear R² ceiling = 24.6% force_magnitude >p90 → linear R² ceiling = 27.4% force_magnitude >p95 → linear R² ceiling = 30.0% The 90% of timesteps with near-zero force contain near-zero signal and drown the 10% of informative samples in the replay buffer. Even a perfect supervised regressor can't beat var(label) when 90% of training samples are (uninformative state, noise-dominated label) pairs. Filtering concentrates the training distribution on the timesteps where robots are actually interacting with obstacles and the payload is actually rotating in response — a ~4× lift in ceiling R². Implementation notes: - Filter applies uniformly to all GSP variants (plain, GSP-B, GSP-N, R-GSP-N, A-GSP-N). The existing prox-activity guards are preserved and ANDed with the new force guard. - Default threshold 0.0 means the filter is disabled unless explicitly enabled via config. Recommended starting value: 4.0 (≈ p75 of force_magnitude in 2-obstacle runs). - stats[i][0] is per-robot force magnitude, already parsed from ZMQ at the top of the episode step loop — no extra I/O cost. Companion: Stelaris launcher will need a parallel wiring so experiments can set GSP_STORE_FORCE_THRESHOLD via the matrix YAML. Co-Authored-By: Claude Opus 4.6 (1M context) --- rl_code/Main.py | 40 ++++++++++++++----------------------- run_baseline_experiments.py | 9 +++++++++ 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/rl_code/Main.py b/rl_code/Main.py index aefba90..e851655 100644 --- a/rl_code/Main.py +++ b/rl_code/Main.py @@ -378,51 +378,41 @@ # print("-------------------------------------------------") # print('[GSP]', next_heading_gsp) - # Store GSP Transition + # Store GSP Transition — guard by per-robot force magnitude. + # GSP_STORE_FORCE_THRESHOLD concentrates training on samples where + # the robot is actively applying force (top ~25% of samples at + # threshold ~4.0), which multiplies the linear-R² ceiling of the + # prediction problem 3–4× (see + # docs/research/2026-04-13-gsp-ddpg-vs-attention-collapse.md). + # 0.0 = filter disabled (legacy behavior). + force_thr = float(config.get('GSP_STORE_FORCE_THRESHOLD', 0.0)) if model.gsp_neighbors: states, state_prox_flags = model.make_gsp_states(old_agent_prox_flags, neighbors_old_heading_gsp, True) new_states = model.make_gsp_states(agent_prox_flags, old_heading_gsp) for i in range(Utility.params['num_robots']): - if np.sum(state_prox_flags[i]) > 0: + if np.sum(state_prox_flags[i]) > 0 and stats[i][0] > force_thr: if model.gsp_networks['learning_scheme'] == 'attention': model.store_gsp_transition(states[i], label, 0, 0, 0) else: - # 2nd arg = label (supervised target for direct-MSE GSP training) state = states[i] new_state = new_states[i] model.store_gsp_transition(state, label, 0, new_state, 0) elif model.gsp_broadcast: - # GSP-B per-agent storage with broadcast inputs. - # state_t : broadcast view at previous step (uses neighbors_old_heading_gsp so - # the prev_gsp slot reflects the prediction from the previous tick) - # state_{t+1}: broadcast view at current step states = model.make_gsp_states_broadcast(old_agent_prox_flags, neighbors_old_heading_gsp) new_states = model.make_gsp_states_broadcast(agent_prox_flags, old_heading_gsp) for i in range(Utility.params['num_robots']): - # Gate on self-prox being non-zero so we only store informative transitions, - # matching the GSP and GSP-N branches. Self-prox lives at index 0 under the - # self-first layout. - if states[i][0] != 0: + if states[i][0] != 0 and stats[i][0] > force_thr: model.store_gsp_transition(states[i], label, 0, new_states[i], 0) else: for i in range(Utility.params['num_robots']): - if model.gsp_networks['learning_scheme'] == 'attention': - state = np.array(old_agent_prox_flags) - # only store the state if it has value - if np.sum(state) > 0: + state = np.array(old_agent_prox_flags) + if np.sum(state) > 0 and stats[i][0] > force_thr: + if model.gsp_networks['learning_scheme'] == 'attention': model.store_gsp_transition(state, label, 0, 0, 0) - elif args.independent_learning: - state = np.array(old_agent_prox_flags) - # only store the state if it has value - if np.sum(state) > 0: - # 2nd arg = label (supervised target for direct-MSE GSP training) + elif args.independent_learning: new_state = np.array(agent_prox_flags) models[i].store_gsp_transition(state, label, 0, new_state, 0) - else: - state = np.array(old_agent_prox_flags) - # only store the state if it has value - if np.sum(state) > 0: - # 2nd arg = label (supervised target for direct-MSE GSP training) + else: new_state = np.array(agent_prox_flags) model.store_gsp_transition(state, label, 0, new_state, 0) diff --git a/run_baseline_experiments.py b/run_baseline_experiments.py index e8bc48c..84ea9b4 100644 --- a/run_baseline_experiments.py +++ b/run_baseline_experiments.py @@ -162,6 +162,15 @@ def make_config(exp_name, gsp, neighbors, num_obstacles, use_gate, gate_curricul "GSP_LEARNING_FREQUENCY": 4, "LEARN_EVERY": 4, "GSP_BATCH_SIZE": 256, + # Per-robot force_magnitude threshold for GSP replay buffer store filter. + # 0.0 = disabled (store every transition with prox activity, legacy behavior). + # > 0 = only store transitions where stats[i][0] (force_magnitude) exceeds + # the threshold. This concentrates GSP training on samples where the robot + # is actively applying force, which empirically multiplies the linear-R² + # ceiling of the prediction problem 3–4× (see + # docs/research/2026-04-13-gsp-ddpg-vs-attention-collapse.md in Stelaris). + # Recommended starting point: ~4.0 (≈ p75 of force_magnitude in 2-obstacle runs). + "GSP_STORE_FORCE_THRESHOLD": 0.0, }