diff --git a/rl_code/Main.py b/rl_code/Main.py index 5920078..aefba90 100644 --- a/rl_code/Main.py +++ b/rl_code/Main.py @@ -111,6 +111,7 @@ 'recurrent': config['RECURRENT'], 'attention': config['ATTENTION'], 'neighbors': config['NEIGHBORS'], + 'broadcast': config.get('BROADCAST', False), 'gsp_input_size':config['GSP_INPUT_SIZE'], 'gsp_output_size':config['GSP_OUTPUT_SIZE'], 'gsp_look_back':config['GSP_LOOK_BACK'], @@ -362,6 +363,11 @@ if model.gsp_neighbors: agent_gsp_states = model.make_gsp_states(agent_prox_flags, old_heading_gsp) ctde_gsp = model.choose_agent_gsp(agent_gsp_states, test_mode) + elif model.gsp_broadcast: + # GSP-B: per-agent self-centric view with full-broadcast + # [self_prox, self_prev_gsp, other_i_prox, other_i_prev_gsp, ...] + agent_gsp_states = model.make_gsp_states_broadcast(agent_prox_flags, old_heading_gsp) + ctde_gsp = model.choose_agent_gsp(agent_gsp_states, test_mode) else: ctde_gsp = model.choose_agent_gsp(agent_prox_flags, test_mode) for i in range(Utility.params['num_robots']): @@ -377,20 +383,27 @@ states, state_prox_flags = model.make_gsp_states(old_agent_prox_flags, neighbors_old_heading_gsp, True) new_states = model.make_gsp_states(agent_prox_flags, old_heading_gsp) for i in range(Utility.params['num_robots']): - # print(f'[AGENT] {i} PROX FLAGS:', state_prox_flags[i]) - # only store if state has value if np.sum(state_prox_flags[i]) > 0: - # print(f'[AGENT] {i} Has Value, Storing GSP State: {states[i]}') if model.gsp_networks['learning_scheme'] == 'attention': model.store_gsp_transition(states[i], label, 0, 0, 0) else: - # Under the direct-MSE GSP training path, the 2nd arg - # (action field) carries the supervised target label. - # See GSP-RL fix/gsp-direct-mse-training PR #24 and - # Stelaris docs/research/2026-04-13-gsp-information-collapse-analysis.md. 
+ # 2nd arg = label (supervised target for direct-MSE GSP training) state = states[i] new_state = new_states[i] model.store_gsp_transition(state, label, 0, new_state, 0) + elif model.gsp_broadcast: + # GSP-B per-agent storage with broadcast inputs. + # state_t : broadcast view at previous step (uses neighbors_old_heading_gsp so + # the prev_gsp slot reflects the prediction from the previous tick) + # state_{t+1}: broadcast view at current step + states = model.make_gsp_states_broadcast(old_agent_prox_flags, neighbors_old_heading_gsp) + new_states = model.make_gsp_states_broadcast(agent_prox_flags, old_heading_gsp) + for i in range(Utility.params['num_robots']): + # Gate on self-prox being non-zero so we only store informative transitions, + # matching the GSP and GSP-N branches. Self-prox lives at index 0 under the + # self-first layout. + if states[i][0] != 0: + model.store_gsp_transition(states[i], label, 0, new_states[i], 0) else: for i in range(Utility.params['num_robots']): if model.gsp_networks['learning_scheme'] == 'attention': diff --git a/rl_code/src/agent.py b/rl_code/src/agent.py index 337ee28..7ac8312 100644 --- a/rl_code/src/agent.py +++ b/rl_code/src/agent.py @@ -32,14 +32,25 @@ def __init__( gsp_min_max_action: float, gsp_look_back: int, gsp_sequence_length: int, + broadcast: bool = False, prox_filter_angle_deg: float = 45.0, n_hop_neighbors: int = 1, ): + if neighbors and broadcast: + raise ValueError( + "GSP variants neighbors=True and broadcast=True are mutually exclusive — " + "they overload gsp_input_size differently. Pick one." + ) if neighbors: # 2 inputs from ownship (prev_gsp, avg_prox) # 2 inputs from each neighbor (prev_gsp, avg_prox) # 2*n_hop_neighbors for symmetry in both CW and CCW - gsp_input_size = 2+2*(n_hop_neighbors*2) + gsp_input_size = 2+2*(n_hop_neighbors*2) + if broadcast: + # GSP-B: each agent's view is (self_prox, self_prev_gsp) + (other_prox, other_prev_gsp) + # for all (n_agents - 1) other agents. Total 2*n_agents. 
Known limitation: + # coupled to team size, not transferable across num_robots. + gsp_input_size = 2 * n_agents output_size = n_actions if network in ['DQN', 'DDQN']: @@ -68,13 +79,16 @@ def __init__( self._network = network self._n_actions = n_actions self._neighbors = neighbors + self._broadcast = broadcast self._n_hop_neighbors = n_hop_neighbors self.neighbors_dict = {} self._options_per_action = options_per_action self._prox_filter_angle_deg = prox_filter_angle_deg - if self._neighbors: + if self._neighbors or self._broadcast: + # Per-agent observation ring buffers: GSP-N and GSP-B both produce + # per-agent self-centric views, so each agent has its own history. self.gsp_observation = [] for _ in range(self._n_agents): self.gsp_observation.append([[0 for _ in range(self.gsp_network_input)] for _ in range(self.gsp_sequence_length)]) @@ -98,6 +112,10 @@ def __init__( def gsp_neighbors(self): return self._neighbors + @property + def gsp_broadcast(self): + return self._broadcast + @property def n_agents(self): return self._n_agents @@ -155,6 +173,40 @@ def make_agent_state(self, env_obs, heading_gsp=None, global_knowledge=None): env_obs = np.concatenate((env_obs, global_knowledge)) return env_obs + def make_gsp_states_broadcast(self, agent_prox_values, agent_prev_gsp): + """Build per-agent GSP inputs for GSP-B (full-broadcast variant). + + Each agent's view is self-first: [self_prox, self_prev_gsp, other_0_prox, + other_0_prev_gsp, other_1_prox, other_1_prev_gsp, ..., other_{n-2}_prox, + other_{n-2}_prev_gsp]. "other" iterates all agents in ascending id order, + skipping self (n_agents - 1 pairs). Total length = 2 * n_agents. + + Known limitation: the network input size is coupled to n_agents, so a + trained GSP-B policy does not transfer to teams of different size. This + is the tradeoff vs GSP-N, which uses fixed (self + n_hop_neighbors * 2) + inputs and transfers across team sizes. 
+ """ + states = [] + for agent in range(self._n_agents): + agent_state = np.zeros(self.gsp_network_input) + # Self first + agent_state[0] = agent_prox_values[agent] + agent_state[1] = agent_prev_gsp[agent] + i = 2 + # Then every other agent in ascending id order, skipping self + for other in range(self._n_agents): + if other == agent: + continue + agent_state[i] = agent_prox_values[other] + agent_state[i + 1] = agent_prev_gsp[other] + i += 2 + # Maintain gsp_observation ring buffer the same way make_gsp_states does, + # so recurrent/attention variants can still see sequences if added later. + self.gsp_observation[agent].pop(0) + self.gsp_observation[agent].append(agent_state) + states.append(agent_state) + return states + def make_gsp_states(self, agent_prox_values, agent_prev_gsp, return_prox_flags = False): states = [] prox_flags = [] @@ -242,7 +294,11 @@ def choose_agent_action(self, observation, failures, test=False): return actions, action_num def choose_agent_gsp(self, agent_gsp_states, test = False): - if self._neighbors: + if self._neighbors or self._broadcast: + # Per-agent predictions with self-centric inputs. GSP-N (neighbors) + # and GSP-B (broadcast) share the same per-agent forward-pass shape; + # only the input vector differs. Non-recurrent broadcast uses the + # same stateless path as non-recurrent neighbors. actions = [] for i in range(self._n_agents): if self.recurrent_gsp: @@ -257,7 +313,7 @@ def choose_agent_gsp(self, agent_gsp_states, test = False): ) # Take the last timestep's action actions.append(action_tensor[-1].cpu().detach().numpy()) - else: + else: actions.append(self.choose_action(agent_gsp_states[i], self.gsp_networks, test)) return actions else: diff --git a/tests/test_agent/test_gsp_broadcast.py b/tests/test_agent/test_gsp_broadcast.py new file mode 100644 index 0000000..2a3ca58 --- /dev/null +++ b/tests/test_agent/test_gsp_broadcast.py @@ -0,0 +1,123 @@ +"""Tests for GSP-B (full-broadcast variant) state construction. 
+ +GSP-B: each agent's input is [self_prox, self_prev_gsp, other_0_prox, +other_0_prev_gsp, other_1_prox, other_1_prev_gsp, ..., other_{n-2}_prox, +other_{n-2}_prev_gsp], length 2*n_agents. Self-first ordering. + +Known limitation (inherited from plain GSP): the network input size is +coupled to n_agents, so a trained GSP-B policy only transfers to the same +team size. This is the tradeoff vs GSP-N's fixed (self + n_hop_neighbors * 2) +input which transfers across team sizes. +""" + +import numpy as np +import pytest + +from src.agent import Agent + + +BASE_CONFIG = { + "GAMMA": 0.99, "TAU": 0.005, "ALPHA": 0.001, "BETA": 0.002, "LR": 0.0001, + "EPSILON": 0.0, "EPS_MIN": 0.0, "EPS_DEC": 0.0, + "BATCH_SIZE": 16, "MEM_SIZE": 1000, "REPLACE_TARGET_COUNTER": 10, + "NOISE": 0.0, "UPDATE_ACTOR_ITER": 1, "WARMUP": 0, + "GSP_LEARNING_FREQUENCY": 1, "GSP_BATCH_SIZE": 16, +} + + +def make_agent(n_agents=4, network="DDQN", broadcast=True): + return Agent( + config=BASE_CONFIG, + network=network, + n_agents=n_agents, + n_obs=8, + n_actions=4, + options_per_action=3, + id=0, + min_max_action=1.0, + meta_param_size=1, + gsp=True, + recurrent=False, + attention=False, + neighbors=False, + broadcast=broadcast, + gsp_input_size=4, # overridden when broadcast=True + gsp_output_size=1, + gsp_min_max_action=1.0, + gsp_look_back=2, + gsp_sequence_length=5, + ) + + +def test_broadcast_agent_has_gsp_broadcast_property_true(): + agent = make_agent() + assert agent.gsp_broadcast is True + + +def test_broadcast_agent_gsp_input_size_is_two_times_n_agents(): + """For 4 agents, the broadcast input is [self_prox, self_prev_gsp, +3×(prox, prev_gsp)] = 8.""" + agent = make_agent(n_agents=4) + assert agent.gsp_network_input == 8 + + +def test_broadcast_agent_gsp_input_size_scales_with_n_agents(): + """For 8 agents, input is 16. 
Known limitation: coupled to team size.""" + agent = make_agent(n_agents=8) + assert agent.gsp_network_input == 16 + + +def test_make_gsp_states_broadcast_returns_one_state_per_agent(): + agent = make_agent(n_agents=4) + prox = [0.1, 0.2, 0.3, 0.4] + prev_gsp = [-0.5, 0.0, 0.25, 0.75] + states = agent.make_gsp_states_broadcast(prox, prev_gsp) + assert len(states) == 4 + for s in states: + assert len(s) == 8 + + +def test_make_gsp_states_broadcast_self_first_ordering(): + """For each agent i, the first two entries must be (prox[i], prev_gsp[i]).""" + agent = make_agent(n_agents=4) + prox = [0.11, 0.22, 0.33, 0.44] + prev_gsp = [-0.1, -0.2, -0.3, -0.4] + states = agent.make_gsp_states_broadcast(prox, prev_gsp) + for i in range(4): + assert states[i][0] == pytest.approx(prox[i]), f"agent {i} self_prox" + assert states[i][1] == pytest.approx(prev_gsp[i]), f"agent {i} self_prev_gsp" + + +def test_make_gsp_states_broadcast_others_in_order(): + """After the self-pair, the remaining entries are other agents in ascending id order (skipping self).""" + agent = make_agent(n_agents=4) + prox = [0.10, 0.20, 0.30, 0.40] + prev_gsp = [0.01, 0.02, 0.03, 0.04] + states = agent.make_gsp_states_broadcast(prox, prev_gsp) + # Agent 0: self=0, others=[1, 2, 3] + assert list(states[0]) == pytest.approx([0.10, 0.01, 0.20, 0.02, 0.30, 0.03, 0.40, 0.04]) + # Agent 2: self=2, others=[0, 1, 3] + assert list(states[2]) == pytest.approx([0.30, 0.03, 0.10, 0.01, 0.20, 0.02, 0.40, 0.04]) + # Agent 3: self=3, others=[0, 1, 2] + assert list(states[3]) == pytest.approx([0.40, 0.04, 0.10, 0.01, 0.20, 0.02, 0.30, 0.03]) + + +def test_broadcast_is_mutually_exclusive_with_neighbors(): + """Can't have both neighbors=True and broadcast=True; they overload gsp_input_size.""" + with pytest.raises((ValueError, AssertionError)): + Agent( + config=BASE_CONFIG, + network="DDQN", n_agents=4, n_obs=8, n_actions=4, + options_per_action=3, id=0, min_max_action=1.0, meta_param_size=1, + gsp=True, recurrent=False, 
attention=False, + neighbors=True, broadcast=True, + gsp_input_size=4, gsp_output_size=1, + gsp_min_max_action=1.0, gsp_look_back=2, gsp_sequence_length=5, + ) + + +def test_plain_gsp_without_broadcast_unchanged(): + """Plain GSP (neighbors=False, broadcast=False) keeps the legacy input size.""" + agent = make_agent(broadcast=False) + # Should fall through to the config-provided gsp_input_size=4 + assert agent.gsp_network_input == 4 + assert agent.gsp_broadcast is False