Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""add_task_creator_columns

Revision ID: a1f73ada66c5
Revises: 6c942325c828
Create Date: 2026-05-21 15:08:51.441535

"""

from collections.abc import Sequence

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision: str = "a1f73ada66c5"
down_revision: str | None = "6c942325c828"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None


def upgrade() -> None:
op.add_column("tasks", sa.Column("creator_user_id", sa.String(), nullable=True))
op.add_column(
"tasks", sa.Column("creator_service_account_id", sa.String(), nullable=True)
)
with op.get_context().autocommit_block():
op.execute(
"CREATE INDEX CONCURRENTLY IF NOT EXISTS ix_tasks_creator_user_id "
"ON tasks (creator_user_id)"
)
op.execute(
"CREATE INDEX CONCURRENTLY IF NOT EXISTS ix_tasks_creator_service_account_id "
"ON tasks (creator_service_account_id)"
)
# Add the CHECK as NOT VALID so the brief ACCESS EXCLUSIVE lock doesn't have to
# wait on an existence scan, then VALIDATE under SHARE UPDATE EXCLUSIVE which
# doesn't block concurrent reads/writes. `tasks` is a high-write table; even the
# short ACCESS EXCLUSIVE held during a vanilla CHECK addition queues behind
# in-flight transactions and blocks readers until it releases.
op.execute(
"ALTER TABLE tasks ADD CONSTRAINT ck_tasks_at_most_one_creator "
"CHECK ((creator_user_id IS NULL) OR (creator_service_account_id IS NULL)) "
"NOT VALID"
)
op.execute("ALTER TABLE tasks VALIDATE CONSTRAINT ck_tasks_at_most_one_creator")


def downgrade() -> None:
op.drop_constraint("ck_tasks_at_most_one_creator", "tasks", type_="check")
with op.get_context().autocommit_block():
op.execute(
Comment on lines +40 to +51
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 NOT VALID + VALIDATE in the same Alembic transaction negates the stated safety guarantee

Both ADD CONSTRAINT ... NOT VALID and VALIDATE CONSTRAINT execute inside the same default Alembic transaction. PostgreSQL holds transaction-level locks until the transaction commits, so the ACCESS EXCLUSIVE lock acquired by ADD CONSTRAINT ... NOT VALID is never released until the entire upgrade() function commits — meaning the full-table validation scan runs while ACCESS EXCLUSIVE is still held. On a large tasks table this blocks all concurrent reads and writes for the same wall-clock duration as a plain ADD CONSTRAINT without NOT VALID. The CONCURRENTLY indexes are correctly placed inside autocommit_block (each statement gets its own autocommit transaction), but the constraint pair is not.

To achieve the stated goal, both constraint statements need to run in separate transactions — either by wrapping them together inside their own autocommit_block (each op.execute becomes its own transaction in autocommit mode, so NOT VALID commits with a brief ACCESS EXCLUSIVE and VALIDATE then runs under SHARE UPDATE EXCLUSIVE with concurrent writes allowed), or by splitting into two separate revisions.

Prompt To Fix With AI
This is a comment left during a code review.
Path: agentex/database/migrations/alembic/versions/2026_05_21_1508_add_task_creator_columns_a1f73ada66c5.py
Line: 40-51

Comment:
**NOT VALID + VALIDATE in the same Alembic transaction negates the stated safety guarantee**

Both `ADD CONSTRAINT ... NOT VALID` and `VALIDATE CONSTRAINT` execute inside the same default Alembic transaction. PostgreSQL holds transaction-level locks until the transaction commits, so the `ACCESS EXCLUSIVE` lock acquired by `ADD CONSTRAINT ... NOT VALID` is never released until the entire `upgrade()` function commits — meaning the full-table validation scan runs while `ACCESS EXCLUSIVE` is still held. On a large `tasks` table this blocks all concurrent reads and writes for the same wall-clock duration as a plain `ADD CONSTRAINT` without `NOT VALID`. The CONCURRENTLY indexes are correctly placed inside `autocommit_block` (each statement gets its own autocommit transaction), but the constraint pair is not.

To achieve the stated goal, both constraint statements need to run in separate transactions — either by wrapping them together inside their own `autocommit_block` (each `op.execute` becomes its own transaction in autocommit mode, so `NOT VALID` commits with a brief `ACCESS EXCLUSIVE` and `VALIDATE` then runs under `SHARE UPDATE EXCLUSIVE` with concurrent writes allowed), or by splitting into two separate revisions.

How can I resolve this? If you propose a fix, please make it concise.

Fix in Cursor Fix in Claude Code Fix in Codex

"DROP INDEX CONCURRENTLY IF EXISTS ix_tasks_creator_service_account_id"
)
op.execute("DROP INDEX CONCURRENTLY IF EXISTS ix_tasks_creator_user_id")
op.drop_column("tasks", "creator_service_account_id")
op.drop_column("tasks", "creator_user_id")
3 changes: 2 additions & 1 deletion agentex/database/migrations/migration_history.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
a9959ebcbe98 -> 6c942325c828 (head), adding task cleaned at
6c942325c828 -> a1f73ada66c5 (head), add_task_creator_columns
a9959ebcbe98 -> 6c942325c828, adding task cleaned at
e9c4ff9e6542 -> a9959ebcbe98, finalize_spans_task_id
9ff3ee32c81b -> e9c4ff9e6542, add_tasks_metadata_gin_index
57c5ed4f59ae -> 9ff3ee32c81b, uppercase deployment status enum labels
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,35 @@ async def list_resources(
)
return response["items"]

async def register_resource(
self,
principal: AgentexAuthPrincipalContext,
resource: AgentexResource,
parent_resource: AgentexResource | None = None,
) -> None:
payload: dict = {
"principal": principal,
"resource": resource.model_dump(),
}
if parent_resource is not None:
payload["parent_resource"] = parent_resource.model_dump()
await HttpRequestHandler.post_with_error_handling(
self.agentex_auth_url, "/v1/authz/register", json=payload
)

async def deregister_resource(
self,
principal: AgentexAuthPrincipalContext,
resource: AgentexResource,
) -> None:
payload = {
"principal": principal,
"resource": resource.model_dump(),
}
await HttpRequestHandler.post_with_error_handling(
self.agentex_auth_url, "/v1/authz/deregister", json=payload
)


DAgentexAuthorization = Annotated[
AgentexAuthorizationProxy, Depends(AgentexAuthorizationProxy)
Expand Down
28 changes: 28 additions & 0 deletions agentex/src/adapters/authorization/port.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,31 @@ async def list_resources(
filter_operation: AuthorizedOperationType = AuthorizedOperationType.read,
) -> Iterable[str]:
"""List resource_ids for a given principal"""

@abstractmethod
async def register_resource(
self,
principal: PrincipalT,
resource: AgentexResource,
parent_resource: AgentexResource | None = None,
) -> None:
"""Register a newly-created resource in the authorization graph.

Atomically writes the relation tuples the schema requires (tenant +
owner, plus an optional typed parent like ``task.parent_agent``).
Distinct from ``grant`` because ``grant`` writes a single role
relation, which is insufficient for schemas that gate access on
``tenant->membership``.
"""

@abstractmethod
async def deregister_resource(
self,
principal: PrincipalT,
resource: AgentexResource,
) -> None:
"""Deregister a resource being deleted from the authorization graph.

Removes every relation tuple written for the resource — keeps the
graph in sync with the application database on row delete.
"""
9 changes: 7 additions & 2 deletions agentex/src/adapters/orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
JSON,
BigInteger,
Boolean,
CheckConstraint,
Column,
DateTime,
ForeignKey,
Expand Down Expand Up @@ -75,13 +76,17 @@ class TaskORM(BaseORM):
cleaned_at = Column(DateTime(timezone=True), nullable=True)
params = Column(JSONB, nullable=True)
task_metadata = Column(JSONB, nullable=True)
creator_user_id = Column(String, nullable=True, index=True)
creator_service_account_id = Column(String, nullable=True, index=True)
# Many-to-Many relationship with agents
agents = relationship("AgentORM", secondary="task_agents", back_populates="tasks")

# Indexes for efficient querying
__table_args__ = (
# Index for filtering tasks by status (used in list queries)
Index("ix_tasks_status", "status"),
CheckConstraint(
"creator_user_id IS NULL OR creator_service_account_id IS NULL",
name="ck_tasks_at_most_one_creator",
),
)


Expand Down
9 changes: 9 additions & 0 deletions agentex/src/config/environment_variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ class EnvVarKeys(str, Enum):
AGENTEX_SERVER_TASK_QUEUE = "AGENTEX_SERVER_TASK_QUEUE"
ENABLE_HEALTH_CHECK_WORKFLOW = "ENABLE_HEALTH_CHECK_WORKFLOW"
WEBHOOK_REQUEST_TIMEOUT = "WEBHOOK_REQUEST_TIMEOUT"
FGAC_TASKS_DUAL_WRITE = "FGAC_TASKS_DUAL_WRITE"


class Environment(str, Enum):
Expand Down Expand Up @@ -114,6 +115,10 @@ class EnvironmentVariables(BaseModel):
AGENTEX_SERVER_TASK_QUEUE: str | None = None
ENABLE_HEALTH_CHECK_WORKFLOW: bool = False
WEBHOOK_REQUEST_TIMEOUT: float = 15.0 # Webhook request timeout in seconds
# AGX1-274: gate the task FGAC dual-write call sites. Off by default so
# rollout is operator-controlled per environment. Mirrors KB's
# ``FGAC_KNOWLEDGE_BASES_DUAL_WRITE`` shape.
FGAC_TASKS_DUAL_WRITE: bool = False

@classmethod
def refresh(cls, force_refresh: bool = False) -> EnvironmentVariables | None:
Expand Down Expand Up @@ -203,6 +208,10 @@ def refresh(cls, force_refresh: bool = False) -> EnvironmentVariables | None:
WEBHOOK_REQUEST_TIMEOUT=float(
os.environ.get(EnvVarKeys.WEBHOOK_REQUEST_TIMEOUT, "15.0")
),
FGAC_TASKS_DUAL_WRITE=(
os.environ.get(EnvVarKeys.FGAC_TASKS_DUAL_WRITE, "false").lower()
== "true"
),
)
refreshed_environment_variables = environment_variables
return refreshed_environment_variables
Expand Down
8 changes: 8 additions & 0 deletions agentex/src/domain/entities/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,14 @@ class TaskEntity(BaseModel):
None,
title="Task metadata",
)
creator_user_id: str | None = Field(
None,
title="Identity ID of the user who created this task",
)
creator_service_account_id: str | None = Field(
None,
title="Service identity ID of the service account that created this task",
)

# allow extra fields for agents relationships
model_config = ConfigDict(extra="allow")
Expand Down
63 changes: 63 additions & 0 deletions agentex/src/domain/services/authorization_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,69 @@ async def check(
)
return result

async def register_resource(
self,
resource: AgentexResource,
*,
parent_resource: AgentexResource | None = None,
principal_context=...,
) -> None:
"""Register a freshly-created resource with the authorization graph.

Used immediately after persisting a new row to write the tenant +
owner (and optionally typed parent) relation tuples atomically.
Distinct from ``grant`` because ``grant`` only writes a single
role relation, which is insufficient for schemas (e.g. ``task``)
that require a ``tenant->membership`` gate.
"""
if self._bypass():
logger.info(
f"Authorization bypassed for register_resource on {resource.type}:{resource.selector}"
)
return None

logger.info(
"[authorization_service] Registering resource %s:%s for principal %s (parent=%s)",
resource.type,
resource.selector,
self.principal_context,
parent_resource,
)
await self.gateway.register_resource(
principal_context
if principal_context is not ...
else self.principal_context,
resource,
parent_resource,
)

async def deregister_resource(
self,
resource: AgentexResource,
*,
principal_context=...,
) -> None:
"""Remove every relation tuple written for the resource — used when
deleting the underlying database row."""
if self._bypass():
logger.info(
f"Authorization bypassed for deregister_resource on {resource.type}:{resource.selector}"
)
return None

logger.info(
"[authorization_service] Deregistering resource %s:%s for principal %s",
resource.type,
resource.selector,
self.principal_context,
)
await self.gateway.deregister_resource(
principal_context
if principal_context is not ...
else self.principal_context,
resource,
)

async def list_resources(
self,
filter_resource: AgentexResourceType,
Expand Down
Loading
Loading