From 94a398d6fc815bbeba1be5ed73ba3fe5fb037014 Mon Sep 17 00:00:00 2001 From: Dhruv Madhok Date: Tue, 26 May 2026 15:06:21 -0700 Subject: [PATCH] feat: register_resource with parent edge for agent_api_keys dual-write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the parent_agent cascade gap surfaced on scaleapi/agentex#354. The api_key dual-write (AGX1-272, PR #248) currently calls grant() which writes the owner edge in SpiceDB but NOT the parent_agent edge. The agent_api_key schema requires `read = ... & parent_agent->read & ...`, so every downstream read/update fails closed without that edge. This PR adds register_resource/deregister_resource (Port + adapter + service) and swaps the api_keys use case from grant→register_resource with parent=AgentexResource.agent(agent_id). Now the owner edge and parent_agent edge are written atomically. Stack: - scaleapi/scaleapi#144657 — sgp-authz 0.7.0 (parent_resource kwarg). - scaleapi/agentex#355 — agentex-auth Port + adapter + HTTP routes. - scaleapi/scale-agentex#248 — original AGX1-272 dual-write (this stacks on it). - THIS PR — extends #248 to use the parent-aware path. Changes: - Port: abstract register_resource(resource, parent=None) and deregister_resource(resource). - Adapter proxy: POST /v1/authz/register and /v1/authz/deregister. - Service: mirror existing grant/revoke pattern (principal_context override, _bypass support, parent in log line for cascade debugging). - Use case: swap grant→register_resource passing parent=agent; swap revoke→deregister_resource. except Exception wrappers preserved (fail-closed on register, best-effort on deregister). - Tests: rename mocks to register_resource/deregister_resource; assert the parent edge is passed correctly. Test plan: - pytest agentex/tests/integration/services/test_agent_api_key_service_dual_write.py → 8 / 8 pass. - New test ``test_create_api_key_calls_grant_when_flag_on`` asserts parent.type == AgentexResourceType.agent and parent.selector == agent.id. Other resource types' grant→register_resource swap is out of scope. Co-Authored-By: Claude Opus 4.7 --- .../adapter_agentex_authz_proxy.py | 28 +++++ agentex/src/adapters/authorization/port.py | 26 +++++ .../domain/services/authorization_service.py | 57 ++++++++++ .../use_cases/agent_api_keys_use_case.py | 70 ++++++++---- .../test_agent_api_key_service_dual_write.py | 100 ++++++++++-------- 5 files changed, 215 insertions(+), 66 deletions(-) diff --git a/agentex/src/adapters/authorization/adapter_agentex_authz_proxy.py b/agentex/src/adapters/authorization/adapter_agentex_authz_proxy.py index 0e44479d..c8ba4f3a 100644 --- a/agentex/src/adapters/authorization/adapter_agentex_authz_proxy.py +++ b/agentex/src/adapters/authorization/adapter_agentex_authz_proxy.py @@ -85,6 +85,34 @@ async def list_resources( ) return response["items"] + async def register_resource( + self, + principal: AgentexAuthPrincipalContext, + resource: AgentexResource, + parent: AgentexResource | None = None, + ) -> None: + payload = { + "principal": principal, + "resource": resource.model_dump(), + "parent": parent.model_dump() if parent is not None else None, + } + await HttpRequestHandler.post_with_error_handling( + self.agentex_auth_url, "/v1/authz/register", json=payload + ) + + async def deregister_resource( + self, + principal: AgentexAuthPrincipalContext, + resource: AgentexResource, + ) -> None: + payload = { + "principal": principal, + "resource": resource.model_dump(), + } + await HttpRequestHandler.post_with_error_handling( + self.agentex_auth_url, "/v1/authz/deregister", json=payload + ) + DAgentexAuthorization = Annotated[ AgentexAuthorizationProxy, Depends(AgentexAuthorizationProxy) diff --git a/agentex/src/adapters/authorization/port.py b/agentex/src/adapters/authorization/port.py index 80c05200..f4f33a0a 100644 --- a/agentex/src/adapters/authorization/port.py +++ b/agentex/src/adapters/authorization/port.py @@ -49,3 +49,29 @@ async def list_resources( filter_operation: AuthorizedOperationType = AuthorizedOperationType.read, ) -> Iterable[str]: """List resource_ids for a given principal""" + + @abstractmethod + async def register_resource( + self, + principal: PrincipalT, + resource: AgentexResource, + parent: AgentexResource | None = None, + ) -> None: + """Register a newly created resource in SpiceDB with the principal as + owner. Optionally writes a lifecycle parent edge. + + Use this on resource create instead of ``grant`` when the resource + type's SpiceDB definition has a parent relation that permission + checks cascade through (e.g. ``agent_api_key`` declares + ``parent_agent``). Without writing that edge here the cascade fails + closed. + """ + + @abstractmethod + async def deregister_resource( + self, + principal: PrincipalT, + resource: AgentexResource, + ) -> None: + """Deregister a deleted resource and all of its relationships + (owner, parent, grantees) in a single atomic call.""" diff --git a/agentex/src/domain/services/authorization_service.py b/agentex/src/domain/services/authorization_service.py index 8400603b..06125d57 100644 --- a/agentex/src/domain/services/authorization_service.py +++ b/agentex/src/domain/services/authorization_service.py @@ -188,5 +188,62 @@ async def list_resources( ) return result + async def register_resource( + self, + resource: AgentexResource, + parent: AgentexResource | None = None, + *, + principal_context=..., + ) -> None: + """Register a newly created resource with the principal as owner. + + Prefer this over ``grant`` when the resource's SpiceDB schema has + a parent relation that permissions cascade through (e.g. + ``agent_api_key`` declares ``parent_agent``). Pass ``parent`` to + link the child to its parent atomically; without it the cascade + fails closed. + """ + if self._bypass(): + logger.info(f"Authorization bypassed for register_resource on {resource}") + return None + + effective_principal = ( + principal_context + if principal_context is not ... + else self.principal_context + ) + logger.info( + "[authorization_service] Registering %s:%s for principal %s (parent=%s)", + resource.type, + resource.selector, + effective_principal, + f"{parent.type}:{parent.selector}" if parent is not None else None, + ) + await self.gateway.register_resource(effective_principal, resource, parent) + + async def deregister_resource( + self, + resource: AgentexResource, + *, + principal_context=..., + ) -> None: + """Deregister a deleted resource and all of its relationships.""" + if self._bypass(): + logger.info(f"Authorization bypassed for deregister_resource on {resource}") + return None + + effective_principal = ( + principal_context + if principal_context is not ... + else self.principal_context + ) + logger.info( + "[authorization_service] Deregistering %s:%s for principal %s", + resource.type, + resource.selector, + effective_principal, + ) + await self.gateway.deregister_resource(effective_principal, resource) + DAuthorizationService = Annotated[AuthorizationService, Depends(AuthorizationService)] diff --git a/agentex/src/domain/use_cases/agent_api_keys_use_case.py b/agentex/src/domain/use_cases/agent_api_keys_use_case.py index 8b01167c..0f898056 100644 --- a/agentex/src/domain/use_cases/agent_api_keys_use_case.py +++ b/agentex/src/domain/use_cases/agent_api_keys_use_case.py @@ -135,23 +135,24 @@ async def _register_api_key_in_spark_authz( creator_user_id: str | None, creator_service_account_id: str | None, ) -> str | None: - """Register a new agent_api_key in Spark AuthZ with creator as owner. + """Register a new agent_api_key in Spark AuthZ with creator as owner + AND the parent_agent edge to the owning agent. Called BEFORE the Postgres write — a failure raises and prevents the - row from being persisted, so there is no compensating action to take. - Mirrors the dual-write pattern used for tasks (AGX1-274). - - The current ``Provider.spark`` adapter returns ``{}`` from ``grant``; - no ZedToken is surfaced today, so we always return ``None`` for the - new-write-isolation column. A follow-up will plumb the token through + row from being persisted, so there is no compensating action. + + The ``agent_api_key`` SpiceDB schema has a ``parent_agent`` relation + that read/update/delete permissions cascade through: + ``permission read = internal_effective_viewer & parent_agent->read & + internal_tenant_gate``. We MUST write the parent edge here or every + downstream permission check fails closed. ``register_resource`` + (added in agentex-auth and sgp-authz 0.7.0) writes both the owner + edge and the parent edge atomically in one round-trip. + + The current ``register_resource`` returns ``None``; no ZedToken is + surfaced today, so we always return ``None`` for the + spark_authz_zedtoken column. A follow-up will plumb the token through once the adapter exposes it. - - Note: the ``agent_api_key`` SpiceDB schema has a ``parent_agent`` - relation that read/delete permissions cascade through. The current - ``AuthorizationGateway.grant`` signature does not accept a parent - relation — the agentex-auth adapter is expected to set - ``parent_agent`` based on the resource shape. This is the same - gap Asher's task PR has and is tracked as a follow-up. """ if creator_user_id is None and creator_service_account_id is None: logger.warning( @@ -163,16 +164,36 @@ async def _register_api_key_in_spark_authz( }, ) return None - await self.authorization_service.grant( - resource=AgentexResource.api_key(api_key_id), - ) + try: + await self.authorization_service.register_resource( + resource=AgentexResource.api_key(api_key_id), + parent=AgentexResource.agent(agent_id), + ) + except Exception as exc: + # Fail closed: log + re-raise so the Postgres row is never written. + # The dual-write contract requires the SpiceDB tuple (and parent + # edge) to exist before the row does. + logger.exception( + "Spark AuthZ register_resource failed for agent_api_key; aborting create", + extra={ + "api_key_id": api_key_id, + "agent_id": agent_id, + "account_id": account_id, + "creator_user_id": creator_user_id, + "creator_service_account_id": creator_service_account_id, + "error_type": type(exc).__name__, + }, + ) + raise return None async def _deregister_api_key_from_spark_authz( self, *, api_key_id: str, account_id: str | None ) -> None: - """Best-effort revocation of an api_key's Spark AuthZ tuples on delete. + """Best-effort deregistration of an api_key's Spark AuthZ tuples on delete. + ``deregister_resource`` removes the resource and all of its + relationships (owner, parent_agent, any grantees) atomically. Only invoked when the FGAC_AGENT_API_KEYS_DUAL_WRITE flag is enabled for the caller's account. Failures are logged but do not block the delete. @@ -182,13 +203,18 @@ async def _deregister_api_key_from_spark_authz( ): return try: - await self.authorization_service.revoke( + await self.authorization_service.deregister_resource( resource=AgentexResource.api_key(api_key_id), ) - except Exception: + except Exception as exc: + # Best-effort: log and continue. Postgres row already deleted. logger.warning( - "Spark AuthZ revoke failed for agent_api_key", - extra={"api_key_id": api_key_id, "account_id": account_id}, + "Spark AuthZ deregister failed for agent_api_key; tuple may be orphaned", + extra={ + "api_key_id": api_key_id, + "account_id": account_id, + "error_type": type(exc).__name__, + }, exc_info=True, ) diff --git a/agentex/tests/integration/services/test_agent_api_key_service_dual_write.py b/agentex/tests/integration/services/test_agent_api_key_service_dual_write.py index 89cdcc66..f29bed4f 100644 --- a/agentex/tests/integration/services/test_agent_api_key_service_dual_write.py +++ b/agentex/tests/integration/services/test_agent_api_key_service_dual_write.py @@ -2,17 +2,19 @@ These cover the AGX1-272 dual-write path: -- Flag OFF: ``authorization_service.grant`` is NOT called and the api_key is - written to the repository with creator metadata populated from the - principal context. -- Flag ON: ``grant`` is called with ``AgentexResource.api_key()`` and the - row is written. -- Delete deregisters: ``revoke`` is called when ``delete`` runs under the flag. -- Spark failure prevents row: when ``grant`` raises, the api_key is NOT - persisted. -- Revoke failure does not block delete: when ``revoke`` raises, the DB - delete still completes and the failure is logged. -- No creator → no grant: if neither user_id nor service_account_id is +- Flag OFF: ``authorization_service.register_resource`` is NOT called and + the api_key is written to the repository with creator metadata populated + from the principal context. +- Flag ON: ``register_resource`` is called with ``AgentexResource.api_key()`` + and ``parent=AgentexResource.agent()`` (the parent edge is + load-bearing for cascade), and the row is written. +- Delete deregisters: ``deregister_resource`` is called when ``delete`` runs + under the flag. +- Spark failure prevents row: when ``register_resource`` raises, the api_key + is NOT persisted. +- Deregister failure does not block delete: when ``deregister_resource`` + raises, the DB delete still completes and the failure is logged. +- No creator → no register: if neither user_id nor service_account_id is resolvable, the dual-write is a no-op (logged) and the row still lands. The tests intentionally mock the repository, authorization service, agent @@ -62,8 +64,8 @@ def _build_use_case( *, flag_accounts: str, principal: SimpleNamespace | None, - grant: AsyncMock | None = None, - revoke: AsyncMock | None = None, + register_resource: AsyncMock | None = None, + deregister_resource: AsyncMock | None = None, agent: AgentEntity | None = None, create_raises: Exception | None = None, monkeypatch: pytest.MonkeyPatch, @@ -94,8 +96,12 @@ def _build_use_case( authorization_service = Mock() authorization_service.principal_context = principal - authorization_service.grant = grant or AsyncMock(return_value={}) - authorization_service.revoke = revoke or AsyncMock(return_value=None) + authorization_service.register_resource = register_resource or AsyncMock( + return_value=None + ) + authorization_service.deregister_resource = deregister_resource or AsyncMock( + return_value=None + ) feature_flags = FeatureFlagProvider() @@ -115,8 +121,8 @@ def _build_use_case( return ( use_case, agent_api_key_repository, - authorization_service.grant, - authorization_service.revoke, + authorization_service.register_resource, + authorization_service.deregister_resource, ) @@ -126,7 +132,7 @@ async def test_create_api_key_skips_grant_when_flag_off( monkeypatch: pytest.MonkeyPatch, ) -> None: agent = _agent() - use_case, repo, grant, _ = _build_use_case( + use_case, repo, register, _ = _build_use_case( flag_accounts="", principal=_principal(user_id="user-A", account_id="acct-1"), agent=agent, @@ -141,7 +147,7 @@ async def test_create_api_key_skips_grant_when_flag_off( account_id="acct-1", ) - grant.assert_not_called() + register.assert_not_called() repo.create.assert_awaited_once() assert api_key.creator_user_id == "user-A" assert api_key.creator_service_account_id is None @@ -154,7 +160,7 @@ async def test_create_api_key_calls_grant_when_flag_on( monkeypatch: pytest.MonkeyPatch, ) -> None: agent = _agent() - use_case, repo, grant, _ = _build_use_case( + use_case, repo, register, _ = _build_use_case( flag_accounts="acct-1", principal=_principal(user_id="user-A", account_id="acct-1"), agent=agent, @@ -169,13 +175,19 @@ async def test_create_api_key_calls_grant_when_flag_on( account_id="acct-1", ) - grant.assert_awaited_once() - granted_resource: AgentexResource = grant.await_args.kwargs["resource"] - assert granted_resource.type == AgentexResourceType.api_key - assert granted_resource.selector == api_key.id + register.assert_awaited_once() + registered_resource: AgentexResource = register.await_args.kwargs["resource"] + assert registered_resource.type == AgentexResourceType.api_key + assert registered_resource.selector == api_key.id + # parent_agent edge is load-bearing — without it the SpiceDB cascade + # `read = ... & parent_agent->read & ...` fails closed for every reader. + registered_parent: AgentexResource = register.await_args.kwargs["parent"] + assert registered_parent is not None + assert registered_parent.type == AgentexResourceType.agent + assert registered_parent.selector == agent.id repo.create.assert_awaited_once() assert api_key.creator_user_id == "user-A" - # Provider.spark.grant returns {} today — no zedtoken yet. + # Provider.spark.register_resource returns None today — no zedtoken yet. assert api_key.spark_authz_zedtoken is None @@ -184,7 +196,7 @@ async def test_create_api_key_calls_grant_when_flag_on( async def test_delete_api_key_calls_revoke_when_flag_on( monkeypatch: pytest.MonkeyPatch, ) -> None: - use_case, repo, _, revoke = _build_use_case( + use_case, repo, _, deregister = _build_use_case( flag_accounts="acct-1", principal=_principal(user_id="user-A", account_id="acct-1"), monkeypatch=monkeypatch, @@ -194,10 +206,10 @@ async def test_delete_api_key_calls_revoke_when_flag_on( await use_case.delete(id=api_key_id, account_id="acct-1") repo.delete.assert_awaited_once_with(id=api_key_id) - revoke.assert_awaited_once() - revoked_resource: AgentexResource = revoke.await_args.kwargs["resource"] - assert revoked_resource.type == AgentexResourceType.api_key - assert revoked_resource.selector == api_key_id + deregister.assert_awaited_once() + deregistered_resource: AgentexResource = deregister.await_args.kwargs["resource"] + assert deregistered_resource.type == AgentexResourceType.api_key + assert deregistered_resource.selector == api_key_id @pytest.mark.asyncio @@ -205,7 +217,7 @@ async def test_delete_api_key_calls_revoke_when_flag_on( async def test_delete_api_key_skips_revoke_when_flag_off( monkeypatch: pytest.MonkeyPatch, ) -> None: - use_case, repo, _, revoke = _build_use_case( + use_case, repo, _, deregister = _build_use_case( flag_accounts="", principal=_principal(user_id="user-A", account_id="acct-1"), monkeypatch=monkeypatch, @@ -214,7 +226,7 @@ async def test_delete_api_key_skips_revoke_when_flag_off( await use_case.delete(id=orm_id(), account_id="acct-1") repo.delete.assert_awaited_once() - revoke.assert_not_called() + deregister.assert_not_called() @pytest.mark.asyncio @@ -222,12 +234,12 @@ async def test_delete_api_key_skips_revoke_when_flag_off( async def test_create_api_key_grant_failure_prevents_db_row( monkeypatch: pytest.MonkeyPatch, ) -> None: - grant = AsyncMock(side_effect=RuntimeError("spark unavailable")) + register_resource = AsyncMock(side_effect=RuntimeError("spark unavailable")) agent = _agent() use_case, repo, _, _ = _build_use_case( flag_accounts="acct-1", principal=_principal(user_id="user-A", account_id="acct-1"), - grant=grant, + register_resource=register_resource, agent=agent, monkeypatch=monkeypatch, ) @@ -249,11 +261,11 @@ async def test_create_api_key_grant_failure_prevents_db_row( async def test_delete_api_key_revoke_failure_does_not_block_delete( monkeypatch: pytest.MonkeyPatch, ) -> None: - revoke = AsyncMock(side_effect=RuntimeError("spark unavailable")) - use_case, repo, _, revoke_ref = _build_use_case( + deregister = AsyncMock(side_effect=RuntimeError("spark unavailable")) + use_case, repo, _, deregister_ref = _build_use_case( flag_accounts="acct-1", principal=_principal(user_id="user-A", account_id="acct-1"), - revoke=revoke, + deregister_resource=deregister, monkeypatch=monkeypatch, ) @@ -261,7 +273,7 @@ async def test_delete_api_key_revoke_failure_does_not_block_delete( await use_case.delete(id=orm_id(), account_id="acct-1") repo.delete.assert_awaited_once() - revoke_ref.assert_awaited_once() + deregister_ref.assert_awaited_once() @pytest.mark.asyncio @@ -272,7 +284,7 @@ async def test_create_api_key_skips_grant_when_no_creator_resolvable( """If neither user_id nor service_account_id is available on the principal, the dual-write is a no-op (logged) and the row still lands without a tuple.""" agent = _agent() - use_case, repo, grant, _ = _build_use_case( + use_case, repo, register, _ = _build_use_case( flag_accounts="acct-1", principal=_principal(user_id=None, account_id="acct-1"), agent=agent, @@ -287,7 +299,7 @@ async def test_create_api_key_skips_grant_when_no_creator_resolvable( account_id="acct-1", ) - grant.assert_not_called() + register.assert_not_called() repo.create.assert_awaited_once() assert api_key.creator_user_id is None assert api_key.creator_service_account_id is None @@ -300,7 +312,7 @@ async def test_delete_by_agent_id_and_key_name_revokes_existing( ) -> None: agent = _agent() existing_id = orm_id() - use_case, repo, _, revoke = _build_use_case( + use_case, repo, _, deregister = _build_use_case( flag_accounts="acct-1", principal=_principal(user_id="user-A", account_id="acct-1"), agent=agent, @@ -324,6 +336,6 @@ async def test_delete_by_agent_id_and_key_name_revokes_existing( ) repo.delete_by_agent_id_and_key_name.assert_awaited_once() - revoke.assert_awaited_once() - revoked_resource: AgentexResource = revoke.await_args.kwargs["resource"] - assert revoked_resource.selector == existing_id + deregister.assert_awaited_once() + deregistered_resource: AgentexResource = deregister.await_args.kwargs["resource"] + assert deregistered_resource.selector == existing_id