From 91435cac45b7be29c80aee71c5fd5bb9202c63f9 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 8 May 2026 19:00:50 +0000 Subject: [PATCH 1/5] fix(addie): owner evaluate_agent_quality writes to canonical compliance state Closes the 12-hour gap between owner-triggered storyboard runs and the public /api/registry/agents/:url/compliance endpoint (issue #4247, PR 1 of 4). When evaluate_agent_quality is triggered by the agent owner, the result is now written to agent_compliance_status + agent_compliance_runs + agent_storyboard_status with triggered_by = 'owner_test'. The legacy agent_test_history write (deprecated in PR 3) is kept for all runs, so non-owner runs continue to be recorded there only. Migration 471 adds 'owner_test' to both triggered_by CHECK constraints. notifyComplianceChange is intentionally suppressed for owner runs to prevent iteration-loop Slack spam. https://claude.ai/code/session_01UNHkGhBXk9XD2dpzvSLdhb --- .changeset/unify-owner-compliance-writes.md | 10 ++++ server/src/addie/config-version.ts | 2 +- server/src/addie/mcp/member-tools.ts | 56 ++++++++++++++++++- server/src/db/compliance-db.ts | 2 +- .../471_owner_test_triggered_by.sql | 16 ++++++ 5 files changed, 83 insertions(+), 3 deletions(-) create mode 100644 .changeset/unify-owner-compliance-writes.md create mode 100644 server/src/db/migrations/471_owner_test_triggered_by.sql diff --git a/.changeset/unify-owner-compliance-writes.md b/.changeset/unify-owner-compliance-writes.md new file mode 100644 index 0000000000..1bdb5c7f9e --- /dev/null +++ b/.changeset/unify-owner-compliance-writes.md @@ -0,0 +1,10 @@ +--- +--- + +PR 1 of 4 in the compliance-state unification initiative (issue #4247): owner-triggered +`evaluate_agent_quality` runs now write to canonical compliance tables +(`agent_compliance_status`, `agent_compliance_runs`, `agent_storyboard_status`) with +`triggered_by = 'owner_test'`, closing the 12-hour gap between owner tests and the +public `/api/registry/agents/:url/compliance` endpoint. 
Non-owner runs continue +writing to `agent_test_history` (deprecated in PR 3). Adds `'owner_test'` to both +`triggered_by` CHECK constraints via migration 471. diff --git a/server/src/addie/config-version.ts b/server/src/addie/config-version.ts index f184b82fcd..bf00b2c4d6 100644 --- a/server/src/addie/config-version.ts +++ b/server/src/addie/config-version.ts @@ -30,7 +30,7 @@ import { loadRules, loadResponseStyle } from './rules/index.js'; * Format: YYYY.MM.N where N is incremented for multiple changes in a month * Example: 2025.01.1, 2025.01.2, 2025.02.1 */ -export const CODE_VERSION = '2026.04.6'; +export const CODE_VERSION = '2026.05.1'; // Types export interface ConfigVersion { diff --git a/server/src/addie/mcp/member-tools.ts b/server/src/addie/mcp/member-tools.ts index a37b108c1d..d2046c5e7c 100644 --- a/server/src/addie/mcp/member-tools.ts +++ b/server/src/addie/mcp/member-tools.ts @@ -34,6 +34,7 @@ import { SAMPLE_BRIEFS, classifyCapabilityResolutionError, presentCapabilityResolutionError, + complianceResultToDbInput, type ComplyOptions, type ComplianceTrack, } from '../services/compliance-testing.js'; @@ -3559,8 +3560,61 @@ export function createMemberToolHandlers( ); } - // Record result if the user has an org with this agent saved + // Record result when the user has an org context for this agent. if (organizationId) { + // Write to canonical compliance tables when the calling org owns this agent. + // Mirrors resolveAgentOwnerOrg (registry-api.ts:4733) — joins organization_memberships + // to verify the acting user is still an active member of the owning org. + // Non-owner runs skip the canonical write and fall through to the legacy + // agent_test_history path below. 
+ const workosUserId = memberContext?.workos_user?.workos_user_id; + let isAgentOwner = false; + if (workosUserId) { + try { + const ownerCheck = await query( + `SELECT 1 FROM member_profiles mp + JOIN organization_memberships om + ON om.workos_organization_id = mp.workos_organization_id + WHERE mp.workos_organization_id = $1 + AND mp.agents @> $2::jsonb + AND om.workos_user_id = $3 + LIMIT 1`, + [organizationId, JSON.stringify([{ url: resolved.resolvedUrl }]), workosUserId], + ); + isAgentOwner = ownerCheck.rows.length > 0; + } catch (ownerCheckError) { + logger.warn({ ownerCheckError }, 'evaluate_agent_quality: owner check failed, skipping canonical write'); + } + } + + if (isAgentOwner) { + try { + const metadata = await complianceDb.getRegistryMetadata(resolved.resolvedUrl); + // Skip canonical write if the owner has opted out of compliance monitoring. + if (!metadata?.compliance_opt_out) { + const dbInput = { + ...complianceResultToDbInput( + result, + resolved.resolvedUrl, + metadata?.lifecycle_stage ?? 'production', + 'owner_test', + ), + // Owner test runs are not dry runs — they update the live public record. + // (complianceResultToDbInput hard-codes dry_run: true; override here.) + dry_run: false, + }; + await complianceDb.recordComplianceRun(dbInput); + // notifyComplianceChange intentionally omitted: owner test runs are + // exploratory; compliance-change notifications fire on heartbeat + // transitions only to prevent iteration-loop spam. + } + } catch (error) { + logger.warn({ error, agentUrl: resolved.resolvedUrl }, 'Could not write owner test result to canonical compliance state'); + } + } + + // Legacy write to agent_contexts + agent_test_history. Retained for + // backward compatibility until PR 3 migrates callers and drops the table. 
try { const context = await agentContextDb.getByOrgAndUrl(organizationId, resolved.resolvedUrl); if (context) { diff --git a/server/src/db/compliance-db.ts b/server/src/db/compliance-db.ts index 236abff905..0ff9180249 100644 --- a/server/src/db/compliance-db.ts +++ b/server/src/db/compliance-db.ts @@ -11,7 +11,7 @@ const logger = baseLogger.child({ module: 'compliance-db' }); export type LifecycleStage = 'development' | 'testing' | 'production' | 'deprecated'; export type ComplianceStatus = 'passing' | 'degraded' | 'failing' | 'unknown'; export type OverallRunStatus = 'passing' | 'failing' | 'partial'; -export type TriggeredBy = 'heartbeat' | 'manual' | 'webhook'; +export type TriggeredBy = 'heartbeat' | 'manual' | 'webhook' | 'owner_test'; export type TrackStatus = 'pass' | 'fail' | 'partial' | 'skip' | 'silent'; /** diff --git a/server/src/db/migrations/471_owner_test_triggered_by.sql b/server/src/db/migrations/471_owner_test_triggered_by.sql new file mode 100644 index 0000000000..0e450bb1ad --- /dev/null +++ b/server/src/db/migrations/471_owner_test_triggered_by.sql @@ -0,0 +1,16 @@ +-- Add 'owner_test' to triggered_by CHECK constraints in compliance tables. +-- Owner-triggered storyboard runs (via evaluate_agent_quality) now write to +-- canonical compliance state, distinguished from heartbeat and dashboard-manual +-- runs by triggered_by = 'owner_test'. See issue #4247. 
+ +ALTER TABLE agent_compliance_runs + DROP CONSTRAINT IF EXISTS valid_triggered_by, + ADD CONSTRAINT valid_triggered_by CHECK ( + triggered_by IN ('heartbeat', 'manual', 'webhook', 'owner_test') + ); + +ALTER TABLE agent_storyboard_status + DROP CONSTRAINT IF EXISTS valid_storyboard_triggered_by, + ADD CONSTRAINT valid_storyboard_triggered_by CHECK ( + triggered_by IS NULL OR triggered_by IN ('heartbeat', 'manual', 'webhook', 'owner_test') + ); From 705914717d58c04c795fad6512c6a4b78defe8d9 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 8 May 2026 19:02:21 +0000 Subject: [PATCH 2/5] chore(dist): add generated onboarding-openapi schema files Build-generated output produced by npm run build; matches the tracking pattern of member-agents-openapi.js and registry.js already in dist/schemas/. https://claude.ai/code/session_01UNHkGhBXk9XD2dpzvSLdhb --- dist/schemas/onboarding-openapi.d.ts | 27 +++++ dist/schemas/onboarding-openapi.d.ts.map | 1 + dist/schemas/onboarding-openapi.js | 135 +++++++++++++++++++++++ dist/schemas/onboarding-openapi.js.map | 1 + 4 files changed, 164 insertions(+) create mode 100644 dist/schemas/onboarding-openapi.d.ts create mode 100644 dist/schemas/onboarding-openapi.d.ts.map create mode 100644 dist/schemas/onboarding-openapi.js create mode 100644 dist/schemas/onboarding-openapi.js.map diff --git a/dist/schemas/onboarding-openapi.d.ts b/dist/schemas/onboarding-openapi.d.ts new file mode 100644 index 0000000000..ed29eaf6de --- /dev/null +++ b/dist/schemas/onboarding-openapi.d.ts @@ -0,0 +1,27 @@ +/** + * OpenAPI registrations for the onboarding REST surface. + * + * `POST /api/organizations` has existed in production for a long time but + * has only ever been documented as a private endpoint exercised by the AAO + * dashboard's `/onboarding` form. 
Surfacing it in the public spec is the + * minimum-surface answer to the storefront-bootstrap question: a + * third-party app holding only a user's OAuth token needs *one* documented + * call to materialize the org, then `POST /api/me/agents` to land an agent + * (which auto-creates the member profile on first call). + * + * Two fields the handler accepts but the public schema deliberately omits: + * + * - `membership_tier` — owned exclusively by the Stripe webhook. Accepting + * it from the caller would let any user stamp tier intent on their org + * row, leaking tier-gated UI state until/unless a real subscription + * overwrites the column. + * - `corporate_domain` — server derives the value from the authenticated + * user's email. Accepting it as a field invited 400s when a caller's + * value disagreed with their email and gave nothing back when it agreed. + * + * Kept in its own module so the spec generator's import graph stays free + * of route handlers (each route file's transitive imports pull in WorkOS + * init, which fails at module load without env vars). + */ +export {}; +//# sourceMappingURL=onboarding-openapi.d.ts.map \ No newline at end of file diff --git a/dist/schemas/onboarding-openapi.d.ts.map b/dist/schemas/onboarding-openapi.d.ts.map new file mode 100644 index 0000000000..9e449fbf0f --- /dev/null +++ b/dist/schemas/onboarding-openapi.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"onboarding-openapi.d.ts","sourceRoot":"","sources":["../../server/src/schemas/onboarding-openapi.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG"} \ No newline at end of file diff --git a/dist/schemas/onboarding-openapi.js b/dist/schemas/onboarding-openapi.js new file mode 100644 index 0000000000..d3e3074486 --- /dev/null +++ b/dist/schemas/onboarding-openapi.js @@ -0,0 +1,135 @@ +/** + * OpenAPI registrations for the onboarding REST surface. 
+ * + * `POST /api/organizations` has existed in production for a long time but + * has only ever been documented as a private endpoint exercised by the AAO + * dashboard's `/onboarding` form. Surfacing it in the public spec is the + * minimum-surface answer to the storefront-bootstrap question: a + * third-party app holding only a user's OAuth token needs *one* documented + * call to materialize the org, then `POST /api/me/agents` to land an agent + * (which auto-creates the member profile on first call). + * + * Two fields the handler accepts but the public schema deliberately omits: + * + * - `membership_tier` — owned exclusively by the Stripe webhook. Accepting + * it from the caller would let any user stamp tier intent on their org + * row, leaking tier-gated UI state until/unless a real subscription + * overwrites the column. + * - `corporate_domain` — server derives the value from the authenticated + * user's email. Accepting it as a field invited 400s when a caller's + * value disagreed with their email and gave nothing back when it agreed. + * + * Kept in its own module so the spec generator's import graph stays free + * of route handlers (each route file's transitive imports pull in WorkOS + * init, which fails at module load without env vars). + */ +import { z } from 'zod'; +import { registry, ErrorSchema } from './registry.js'; +const OrganizationCompanyTypeSchema = z + .enum(['adtech', 'agency', 'brand', 'publisher', 'data', 'ai', 'other']) + .openapi('OrganizationCompanyType', { + description: "Coarse classification of the organization's role in the open ad ecosystem. Drives default verification badges and the member profile's display category.", +}); +const OrganizationRevenueTierSchema = z + .enum(['under_1m', '1m_5m', '5m_50m', '50m_250m', '250m_1b', '1b_plus']) + .openapi('OrganizationRevenueTier', { + description: 'Annual revenue band, USD. 
Drives membership-tier eligibility for company-tier seats.', +}); +const CreateOrganizationInputSchema = z + .object({ + organization_name: z.string().min(1).max(200).openapi({ + description: "Display name for the organization. Used both as the org row name and (when auto-bootstrapping a member profile via the first agent registration) as the profile's `display_name`.", + example: 'Acme Media', + }), + is_personal: z.boolean().optional().openapi({ + description: 'Set to `true` to create a personal workspace instead of a corporate organization. Personal workspaces skip corporate-domain verification, are limited to one per user, and cannot host the `company_*` membership tiers.', + default: false, + }), + company_type: OrganizationCompanyTypeSchema.optional(), + revenue_tier: OrganizationRevenueTierSchema.optional(), + marketing_opt_in: z.boolean().optional().openapi({ + description: 'Whether the caller opted in to AAO marketing communications. Recorded once per user (not overwritten on subsequent calls). 
Independent of Terms-of-Service consent, which is recorded server-side from the request context.', + default: false, + }), +}) + .openapi('CreateOrganizationInput', { + description: [ + 'Request body for `POST /api/organizations`.', + "Bootstraps a WorkOS organization, mirrors the caller as `owner`, records the caller's ToS / privacy-policy acceptance, and (for non-personal orgs) inserts an email-verified record into `organization_domains` so subsequent registry calls can skip explicit domain-verification.", + "Membership tier and corporate domain are *not* caller-supplied: the tier is set by the Stripe webhook on subscription events, and the corporate domain is derived from the authenticated user's email.", + ].join('\n\n'), +}); +const CreateOrganizationResponseSchema = z + .object({ + success: z.boolean().optional(), + organization: z + .object({ + id: z.string().openapi({ example: 'org_01HXZAB123' }), + name: z.string().openapi({ example: 'Acme Media' }), + }) + .optional(), + id: z.string().optional().openapi({ + description: "Set on the **prospect-adoption** path: when an org with the user's email domain already exists in a `prospect` state (i.e. the registry pre-recorded it from a brand crawl but no human had claimed it yet), this call adopts that org for the caller instead of creating a new one.", + }), + name: z.string().optional(), + adopted: z.boolean().optional().openapi({ + description: '`true` when the response is the prospect-adoption path. When `true`, no new WorkOS organization was created — the caller is now the owner of an existing prospect record.', + }), +}) + .openapi('CreateOrganizationResponse', { + description: 'Response from `POST /api/organizations`. The body shape varies by path: a fresh creation returns `{ success: true, organization: { id, name } }`; a prospect adoption returns `{ id, name, adopted: true }` directly. 
Both paths are 2xx; downstream callers should treat any `2xx` as "the org now exists and you are an owner of it" and read whichever id is present.', +}); +registry.registerPath({ + method: 'post', + path: '/api/organizations', + operationId: 'createOrganization', + summary: 'Create or adopt my organization', + description: [ + "Bootstrap the caller's organization explicitly. Use this when the caller wants to control the organization name, `company_type`, `revenue_tier`, or `is_personal` flag before any agents are registered.", + "**Most storefront-style integrations don't need this call** — `POST /api/me/agents` will auto-create an org for a fresh OAuth user (corporate or personal workspace based on the email domain) and surface `org_auto_created: true` in the response. Reach for `POST /api/organizations` only when the auto-derived defaults aren't acceptable.", + 'Three outcomes depending on the caller\'s state:', + "- **Fresh create** (most common): a new WorkOS organization is created, the caller is added as `owner`, the corporate domain is recorded as email-verified, and ToS / privacy-policy acceptance is logged from the request context. Returns `{ success: true, organization: { id, name } }`.", + "- **Prospect adoption**: an organization with the caller's email domain already exists as a `prospect` (the registry pre-recorded it from a brand crawl but no human had claimed it yet). The caller is promoted to `owner` of the existing record instead of forking a duplicate. Returns `{ id, name, adopted: true }`.", + '- **Already-active conflict**: the org exists and is already claimed by another paying member or a previously joined user. Returns `409` with the existing org id so the caller can switch to a join-request flow (`POST /api/organizations/:orgId/join-requests`) instead of trying to register a duplicate.', + 'Tier transitions happen via the billing flow only — there is no `membership_tier` field on this endpoint. 
After org creation, send the user to `POST /api/checkout-session` (or the dashboard `/membership` page) to start a subscription; the Stripe webhook is the sole writer of `organizations.membership_tier`.', + 'Rate-limited per user: `15` failed attempts per hour; successful calls do not count against the limit so a legitimate registration is never penalized by earlier validation errors.', + ].join('\n\n'), + tags: ['Onboarding'], + security: [{ bearerAuth: [] }, { oauth2: [] }], + request: { + body: { content: { 'application/json': { schema: CreateOrganizationInputSchema } } }, + }, + responses: { + 200: { + description: 'Prospect adoption — an existing prospect organization for this domain was claimed by the caller. Body is `{ id, name, adopted: true }`.', + content: { 'application/json': { schema: CreateOrganizationResponseSchema } }, + }, + 201: { + description: 'New organization created. Body is `{ success: true, organization: { id, name } }`. The caller is the `owner`; the corporate domain is recorded as email-verified for downstream registry calls.', + content: { 'application/json': { schema: CreateOrganizationResponseSchema } }, + }, + 400: { + description: [ + 'One of:', + '- `organization_name` missing or invalid', + '- `company_type` / `revenue_tier` value not in the documented enum', + "- caller is on a personal-email domain (gmail.com, yahoo.com, …) and is trying to register a corporate org — register `is_personal: true` instead", + '- per-user organization cap reached (10 orgs per user)', + ].join('\n'), + content: { 'application/json': { schema: ErrorSchema } }, + }, + 401: { + description: 'Authentication required', + content: { 'application/json': { schema: ErrorSchema } }, + }, + 409: { + description: "An active organization already exists for this caller's email domain. 
The body includes `existing_org_id` and `existing_org_name`; the caller should switch to the join-request flow rather than retrying.", + content: { 'application/json': { schema: ErrorSchema } }, + }, + 429: { + description: 'Rate limit exceeded — 15 failed attempts per hour per user. Successful calls do not count against the limit.', + content: { 'application/json': { schema: ErrorSchema } }, + }, + }, +}); +//# sourceMappingURL=onboarding-openapi.js.map \ No newline at end of file diff --git a/dist/schemas/onboarding-openapi.js.map b/dist/schemas/onboarding-openapi.js.map new file mode 100644 index 0000000000..9fb894c315 --- /dev/null +++ b/dist/schemas/onboarding-openapi.js.map @@ -0,0 +1 @@ +{"version":3,"file":"onboarding-openapi.js","sourceRoot":"","sources":["../../server/src/schemas/onboarding-openapi.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,QAAQ,EAAE,WAAW,EAAE,MAAM,eAAe,CAAC;AAEtD,MAAM,6BAA6B,GAAG,CAAC;KACpC,IAAI,CAAC,CAAC,QAAQ,EAAE,QAAQ,EAAE,OAAO,EAAE,WAAW,EAAE,MAAM,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;KACvE,OAAO,CAAC,yBAAyB,EAAE;IAClC,WAAW,EACT,0JAA0J;CAC7J,CAAC,CAAC;AAEL,MAAM,6BAA6B,GAAG,CAAC;KACpC,IAAI,CAAC,CAAC,UAAU,EAAE,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,SAAS,EAAE,SAAS,CAAC,CAAC;KACvE,OAAO,CAAC,yBAAyB,EAAE;IAClC,WAAW,EACT,sFAAsF;CACzF,CAAC,CAAC;AAEL,MAAM,6BAA6B,GAAG,CAAC;KACpC,MAAM,CAAC;IACN,iBAAiB,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC;QACpD,WAAW,EACT,mLAAmL;QACrL,OAAO,EAAE,YAAY;KACtB,CAAC;IACF,WAAW,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC;QAC1C,WAAW,EACT,0NAA0N;QAC5N,OAAO,EAAE,KAAK;KACf,CAAC;IACF,YAAY,EAAE,6BAA6B,CAAC,QAAQ,EAAE;IACtD,YAAY,EAAE,6BAA6B,CAAC,QAAQ,EAAE;IACtD,gBAAgB,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC;QAC/C,WAAW,EACT,6NAA6N;QAC/N,OAAO,EAAE,KAAK;KACf,CAAC;CACH,CAAC;KACD,OAAO,CAAC,yBAAyB,EAAE;IAClC,WAAW,EAAE;QACX,6CAA6C;QAC7C,qRAAqR;QACrR,wMAAwM;KACzM,CAAC,IAAI,CAAC,MAAM,CAAC;CACf,CAAC,CAAC;AAE
L,MAAM,gCAAgC,GAAG,CAAC;KACvC,MAAM,CAAC;IACN,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,EAAE;IAC/B,YAAY,EAAE,CAAC;SACZ,MAAM,CAAC;QACN,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,OAAO,CAAC,EAAE,OAAO,EAAE,gBAAgB,EAAE,CAAC;QACrD,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,OAAO,CAAC,EAAE,OAAO,EAAE,YAAY,EAAE,CAAC;KACpD,CAAC;SACD,QAAQ,EAAE;IACb,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC;QAChC,WAAW,EACT,sRAAsR;KACzR,CAAC;IACF,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC3B,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC;QACtC,WAAW,EACT,2KAA2K;KAC9K,CAAC;CACH,CAAC;KACD,OAAO,CAAC,4BAA4B,EAAE;IACrC,WAAW,EACT,0WAA0W;CAC7W,CAAC,CAAC;AAEL,QAAQ,CAAC,YAAY,CAAC;IACpB,MAAM,EAAE,MAAM;IACd,IAAI,EAAE,oBAAoB;IAC1B,WAAW,EAAE,oBAAoB;IACjC,OAAO,EAAE,iCAAiC;IAC1C,WAAW,EAAE;QACX,0MAA0M;QAC1M,iVAAiV;QACjV,kDAAkD;QAClD,8RAA8R;QAC9R,2TAA2T;QAC3T,+SAA+S;QAC/S,sTAAsT;QACtT,qLAAqL;KACtL,CAAC,IAAI,CAAC,MAAM,CAAC;IACd,IAAI,EAAE,CAAC,YAAY,CAAC;IACpB,QAAQ,EAAE,CAAC,EAAE,UAAU,EAAE,EAAE,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,CAAC;IAC9C,OAAO,EAAE;QACP,IAAI,EAAE,EAAE,OAAO,EAAE,EAAE,kBAAkB,EAAE,EAAE,MAAM,EAAE,6BAA6B,EAAE,EAAE,EAAE;KACrF;IACD,SAAS,EAAE;QACT,GAAG,EAAE;YACH,WAAW,EACT,yIAAyI;YAC3I,OAAO,EAAE,EAAE,kBAAkB,EAAE,EAAE,MAAM,EAAE,gCAAgC,EAAE,EAAE;SAC9E;QACD,GAAG,EAAE;YACH,WAAW,EACT,iMAAiM;YACnM,OAAO,EAAE,EAAE,kBAAkB,EAAE,EAAE,MAAM,EAAE,gCAAgC,EAAE,EAAE;SAC9E;QACD,GAAG,EAAE;YACH,WAAW,EAAE;gBACX,SAAS;gBACT,0CAA0C;gBAC1C,oEAAoE;gBACpE,mJAAmJ;gBACnJ,wDAAwD;aACzD,CAAC,IAAI,CAAC,IAAI,CAAC;YACZ,OAAO,EAAE,EAAE,kBAAkB,EAAE,EAAE,MAAM,EAAE,WAAW,EAAE,EAAE;SACzD;QACD,GAAG,EAAE;YACH,WAAW,EAAE,yBAAyB;YACtC,OAAO,EAAE,EAAE,kBAAkB,EAAE,EAAE,MAAM,EAAE,WAAW,EAAE,EAAE;SACzD;QACD,GAAG,EAAE;YACH,WAAW,EACT,4MAA4M;YAC9M,OAAO,EAAE,EAAE,kBAAkB,EAAE,EAAE,MAAM,EAAE,WAAW,EAAE,EAAE;SACzD;QACD,GAAG,EAAE;YACH,WAAW,EACT,8GAA8G;YAChH,OAAO,EAAE,EAAE,kBAAkB,EAAE,EAAE,MAAM,EAAE,WAAW,EAAE,EAAE;SACzD;KACF;CACF,CAAC,CAAC"} \ No newline at end of file From 0c996a6cae6ff76af4fbf04203dd653a8ccfad31 Mon Sep 17 00:00:00 2001 From: Claude 
Date: Fri, 8 May 2026 19:25:56 +0000 Subject: [PATCH 3/5] fix(addie): add verdict_source to compliance response + last-write-wins test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address review feedback from @EmmaLouise2018 on PR #4250: 1. `verdict_source` field on /api/registry/agents/:url/compliance — `AgentComplianceDetailSchema` gains optional `verdict_source`: 'heartbeat' | 'owner_test' | 'manual' | 'webhook' | null — `getComplianceStatus` and `bulkGetComplianceStatus` join `agent_compliance_runs` via LATERAL subquery (dry_run=false, ORDER BY tested_at DESC LIMIT 1) to surface the triggered_by of the most recent run. No migration needed. — Endpoint response emits `verdict_source: status.last_triggered_by`. — `AgentComplianceStatus` interface gets `last_triggered_by` field. 2. Last-write-wins contract test — New `compliance-db-last-write-wins.test.ts` pins the ON CONFLICT DO UPDATE semantics: every recordComplianceRun call overwrites agent_compliance_status regardless of triggered_by source. A future change to first-write-wins or priority ordering would break these tests. 
https://claude.ai/code/session_01NVVqgeSGevUGXgDbMw1XKZ --- server/src/db/compliance-db.ts | 18 +- server/src/routes/registry-api.ts | 1 + server/src/schemas/registry.ts | 2 + .../compliance-db-last-write-wins.test.ts | 204 ++++++++++++++++++ 4 files changed, 223 insertions(+), 2 deletions(-) create mode 100644 server/tests/unit/compliance-db-last-write-wins.test.ts diff --git a/server/src/db/compliance-db.ts b/server/src/db/compliance-db.ts index 0ff9180249..d3770fb889 100644 --- a/server/src/db/compliance-db.ts +++ b/server/src/db/compliance-db.ts @@ -118,6 +118,8 @@ export interface AgentComplianceStatus { previous_status: string | null; status_changed_at: Date | null; updated_at: Date; + /** triggered_by of the most recent non-dry-run in agent_compliance_runs */ + last_triggered_by: TriggeredBy | null; } export type StoryboardStatus = 'passing' | 'failing' | 'partial' | 'untested'; @@ -427,9 +429,15 @@ export class ComplianceDatabase { async getComplianceStatus(agentUrl: string): Promise { const result = await query( - `SELECT s.*, COALESCE(m.lifecycle_stage, 'production') AS lifecycle_stage + `SELECT s.*, COALESCE(m.lifecycle_stage, 'production') AS lifecycle_stage, + r.triggered_by AS last_triggered_by FROM agent_compliance_status s LEFT JOIN agent_registry_metadata m ON m.agent_url = s.agent_url + LEFT JOIN LATERAL ( + SELECT triggered_by FROM agent_compliance_runs + WHERE agent_url = s.agent_url AND dry_run = false + ORDER BY tested_at DESC LIMIT 1 + ) r ON true WHERE s.agent_url = $1`, [agentUrl], ); @@ -455,9 +463,15 @@ export class ComplianceDatabase { if (agentUrls.length === 0) return new Map(); const result = await query( - `SELECT s.*, COALESCE(m.lifecycle_stage, 'production') AS lifecycle_stage + `SELECT s.*, COALESCE(m.lifecycle_stage, 'production') AS lifecycle_stage, + r.triggered_by AS last_triggered_by FROM agent_compliance_status s LEFT JOIN agent_registry_metadata m ON m.agent_url = s.agent_url + LEFT JOIN LATERAL ( + SELECT triggered_by FROM 
agent_compliance_runs + WHERE agent_url = s.agent_url AND dry_run = false + ORDER BY tested_at DESC LIMIT 1 + ) r ON true WHERE s.agent_url = ANY($1)`, [agentUrls], ); diff --git a/server/src/routes/registry-api.ts b/server/src/routes/registry-api.ts index 925aec6cda..936b7b5dc0 100644 --- a/server/src/routes/registry-api.ts +++ b/server/src/routes/registry-api.ts @@ -4253,6 +4253,7 @@ export function createRegistryApiRouter(config: RegistryApiConfig): Router { membership_tier_label: ownerMembership.membership_tier_label, subscription_status: ownerMembership.subscription_status, is_api_access_tier: ownerMembership.is_api_access_tier, + verdict_source: status.last_triggered_by ?? null, verified: badges.length > 0, verified_badges: badges.map(b => ({ role: b.role, diff --git a/server/src/schemas/registry.ts b/server/src/schemas/registry.ts index 4a59bdd2be..f29a961f11 100644 --- a/server/src/schemas/registry.ts +++ b/server/src/schemas/registry.ts @@ -341,6 +341,8 @@ export const AgentComplianceDetailSchema = z membership_tier_label: z.string().nullable().optional().openapi({ description: "Owner-scoped: human-readable label for membership_tier (e.g. 'Builder'). Null for non-owners." }), subscription_status: z.string().nullable().optional().openapi({ description: "Owner-scoped: the agent owner's subscription status (active, past_due, trialing, etc.). Null for non-owners." }), is_api_access_tier: z.boolean().optional().openapi({ description: "Owner-scoped: true when the owner's tier and subscription status grant badge eligibility. False for non-owners. Single source of truth — UI should not re-derive." }), + verdict_source: z.enum(["heartbeat", "owner_test", "manual", "webhook"]).nullable().optional() + .openapi({ description: "triggered_by value of the most recent non-dry-run compliance check. 'heartbeat' = scheduled run; 'owner_test' = agent owner triggered via evaluate_agent_quality. Null when no run has been recorded yet." 
}), verified: z.boolean().optional(), verified_badges: z.array(VerificationBadgeSchema).optional(), }) diff --git a/server/tests/unit/compliance-db-last-write-wins.test.ts b/server/tests/unit/compliance-db-last-write-wins.test.ts new file mode 100644 index 0000000000..bec028e4bc --- /dev/null +++ b/server/tests/unit/compliance-db-last-write-wins.test.ts @@ -0,0 +1,204 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest'; + +vi.mock('../../src/db/client.js', () => ({ + query: vi.fn(), + getClient: vi.fn(), +})); + +vi.mock('../../src/db/encryption.js', () => ({ + decrypt: vi.fn(), + encrypt: vi.fn(), + deriveKey: vi.fn(), +})); + +import { ComplianceDatabase } from '../../src/db/compliance-db.js'; +import { query, getClient } from '../../src/db/client.js'; + +const mockedQuery = vi.mocked(query); +const mockedGetClient = vi.mocked(getClient); + +const EMPTY = { rows: [], rowCount: 0, command: '', oid: 0, fields: [] }; + +function makeTransactionClient(queryResponses: Array<{ rows: any[] }>) { + const calls: string[] = []; + let idx = 0; + const client = { + query: vi.fn(async (sql: string) => { + calls.push(typeof sql === 'string' ? sql.trim().split(/\s+/)[0] : sql); + const resp = queryResponses[idx] ?? 
EMPTY; + idx++; + return { ...EMPTY, ...resp }; + }), + release: vi.fn(), + _calls: calls, + }; + return client; +} + +const AGENT_URL = 'https://agent.example.com'; + +function makeRunRow(triggeredBy: string) { + return { + id: 'run-001', + agent_url: AGENT_URL, + lifecycle_stage: 'production', + overall_status: 'passing', + headline: null, + total_duration_ms: 100, + tested_at: new Date(), + tracks_json: [], + tracks_passed: 1, + tracks_failed: 0, + tracks_skipped: 0, + tracks_partial: 0, + agent_profile_json: null, + observations_json: null, + triggered_by: triggeredBy, + dry_run: false, + }; +} + +const minimalInput = (triggeredBy: 'heartbeat' | 'owner_test') => ({ + agent_url: AGENT_URL, + lifecycle_stage: 'production' as const, + overall_status: 'passing' as const, + tracks_json: [{ track: 'core', status: 'pass' as const, scenario_count: 1, passed_count: 1, duration_ms: 100 }], + tracks_passed: 1, + tracks_failed: 0, + tracks_skipped: 0, + tracks_partial: 0, + triggered_by: triggeredBy, + dry_run: false, +}); + +describe('ComplianceDatabase — last-write-wins on agent_compliance_status', () => { + let db: ComplianceDatabase; + + beforeEach(() => { + db = new ComplianceDatabase(); + vi.clearAllMocks(); + }); + + /** + * Contract: agent_compliance_status uses ON CONFLICT DO UPDATE (not DO NOTHING). + * Every recordComplianceRun call — regardless of triggered_by — overwrites the + * materialized status row. A future change to "pick highest-priority source" or + * "first-write-wins" would break this test. 
+ */ + it('always upserts status regardless of triggered_by — last-write-wins', async () => { + const statusRow = { rows: [{ status: 'passing', previous_status: null }] }; + + const client = makeTransactionClient([ + EMPTY, // BEGIN + { rows: [makeRunRow('heartbeat')] }, // INSERT agent_compliance_runs + statusRow, // UPSERT agent_compliance_status + EMPTY, // COMMIT + ]); + mockedGetClient.mockResolvedValueOnce(client as any); + + await db.recordComplianceRun(minimalInput('heartbeat')); + + const upsertCall = client.query.mock.calls.find( + ([sql]: [string]) => typeof sql === 'string' && sql.includes('ON CONFLICT (agent_url) DO UPDATE'), + ); + expect(upsertCall).toBeDefined(); + }); + + it('owner_test write at T+1 wins over prior heartbeat — triggered_by is forwarded verbatim', async () => { + const statusRow = { rows: [{ status: 'passing', previous_status: 'passing' }] }; + + const client1 = makeTransactionClient([ + EMPTY, + { rows: [makeRunRow('heartbeat')] }, + statusRow, + EMPTY, + ]); + mockedGetClient.mockResolvedValueOnce(client1 as any); + await db.recordComplianceRun(minimalInput('heartbeat')); + + const heartbeatRunInsert = client1.query.mock.calls.find( + ([sql]: [string]) => typeof sql === 'string' && sql.includes('INSERT INTO agent_compliance_runs'), + ); + expect(heartbeatRunInsert).toBeDefined(); + expect(heartbeatRunInsert![1]).toContain('heartbeat'); + + const client2 = makeTransactionClient([ + EMPTY, + { rows: [makeRunRow('owner_test')] }, + statusRow, + EMPTY, + ]); + mockedGetClient.mockResolvedValueOnce(client2 as any); + await db.recordComplianceRun(minimalInput('owner_test')); + + const ownerTestRunInsert = client2.query.mock.calls.find( + ([sql]: [string]) => typeof sql === 'string' && sql.includes('INSERT INTO agent_compliance_runs'), + ); + expect(ownerTestRunInsert).toBeDefined(); + expect(ownerTestRunInsert![1]).toContain('owner_test'); + + const ownerTestUpsert = client2.query.mock.calls.find( + ([sql]: [string]) => typeof sql === 
'string' && sql.includes('ON CONFLICT (agent_url) DO UPDATE'), + ); + expect(ownerTestUpsert).toBeDefined(); + }); + + it('heartbeat at T+3 wins over prior owner_test at T+2 — no source-priority filtering', async () => { + const statusRow = { rows: [{ status: 'passing', previous_status: 'passing' }] }; + + const client = makeTransactionClient([ + EMPTY, + { rows: [makeRunRow('heartbeat')] }, + statusRow, + EMPTY, + ]); + mockedGetClient.mockResolvedValueOnce(client as any); + await db.recordComplianceRun(minimalInput('heartbeat')); + + const runInsert = client.query.mock.calls.find( + ([sql]: [string]) => typeof sql === 'string' && sql.includes('INSERT INTO agent_compliance_runs'), + ); + expect(runInsert![1]).toContain('heartbeat'); + + const upsert = client.query.mock.calls.find( + ([sql]: [string]) => typeof sql === 'string' && sql.includes('ON CONFLICT (agent_url) DO UPDATE'), + ); + expect(upsert).toBeDefined(); + }); + + it('getComplianceStatus LATERAL join returns last_triggered_by from most recent non-dry run', async () => { + const now = new Date(); + mockedQuery.mockResolvedValueOnce({ + rows: [{ + agent_url: AGENT_URL, + status: 'passing', + lifecycle_stage: 'production', + last_checked_at: now, + last_passed_at: now, + last_failed_at: null, + streak_days: 1, + streak_started_at: now, + tracks_summary_json: { core: 'pass' }, + headline: null, + previous_status: null, + status_changed_at: null, + updated_at: now, + last_triggered_by: 'owner_test', + }], + rowCount: 1, + command: '', + oid: 0, + fields: [], + }); + + const status = await db.getComplianceStatus(AGENT_URL); + + expect(status).not.toBeNull(); + expect(status!.last_triggered_by).toBe('owner_test'); + + const [sql] = mockedQuery.mock.calls[0]; + expect(sql).toContain('dry_run = false'); + expect(sql).toContain('ORDER BY tested_at DESC'); + expect(sql).toContain('LIMIT 1'); + }); +}); From 47b26d43f60cebd1adaf189f05441072d707a822 Mon Sep 17 00:00:00 2001 From: Emma Mulitz Date: Fri, 8 May 2026 
16:18:57 -0400 Subject: [PATCH 4/5] feat(dashboard): surface verdict_source + per-run triggered_by badge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR 2 of the #4247 unification stack. Reads two fields PR #4250 added to the compliance API but the dashboard wasn't yet rendering: - compliance tile: appends "(your test)" / "(heartbeat)" / "(manual)" / "(webhook)" after Last checked, so operators see whether the current verdict came from their own evaluate_agent_quality run or the scheduled heartbeat. - history panel: per-run badge with the same source label, info-blue for owner_test and neutral for the rest. Pre-PR-1 rows render with neutral — no regression. No backend changes; pure UI surfacing of fields already in the API. Stacked on PR #4250. --- .../dashboard-surfaces-verdict-source.md | 31 +++++++++++++++++++ server/public/dashboard-agents.html | 31 ++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 .changeset/dashboard-surfaces-verdict-source.md diff --git a/.changeset/dashboard-surfaces-verdict-source.md b/.changeset/dashboard-surfaces-verdict-source.md new file mode 100644 index 0000000000..e7fa44366d --- /dev/null +++ b/.changeset/dashboard-surfaces-verdict-source.md @@ -0,0 +1,31 @@ +--- +--- + +Dashboard `/dashboard/agents` surfaces the new `verdict_source` field on the +compliance tile and a per-run "Your test / Heartbeat / Manual / Webhook" +badge in the History panel. PR 2 of the #4247 unification stack — +read-side cleanup that lets owners distinguish their own on-demand +runs from scheduled heartbeat verdicts at a glance. + +**Context.** PR #4250 added `verdict_source` to +`/api/registry/agents/:url/compliance` and `triggered_by` to each row +returned by `/api/registry/agents/:url/compliance/history`. Both fields +were unrendered in the dashboard until this PR. 
+ +**What changes.** + +- Compliance tile shows `Last checked: 3m ago (your test)` / + `(heartbeat)` / `(manual)` / `(webhook)` after the timestamp. Empty + string when `verdict_source` is null (never run). +- History panel renders a colored badge per run row: + - `Your test` (info-blue) for `triggered_by = 'owner_test'` + - `Heartbeat` (neutral) for `triggered_by = 'heartbeat'` + - `Manual` / `Webhook` (neutral) for the other enum values + +No backend changes; this is pure UI surfacing of fields the API already +emits. Pre-PR-1 rows (which only have `'heartbeat'` / `'manual'` / +`'webhook'`) render with the neutral badge — no regression. + +**Out of scope** (PR 3 of #4247): dropping `agent_test_history` and +backfilling owner-triggered rows. Tracked separately so the destructive +migration soaks behind the read-only UI change. diff --git a/server/public/dashboard-agents.html b/server/public/dashboard-agents.html index ae39c1d0a8..f8b9521219 100644 --- a/server/public/dashboard-agents.html +++ b/server/public/dashboard-agents.html @@ -1501,6 +1501,19 @@

Agents

? timeAgo(new Date(cs.last_checked_at)) : 'never'; + // Surface the verdict source so the operator knows whether the + // current status came from the scheduled heartbeat or their own + // owner-triggered test run. PR #4250 populates cs.verdict_source + // ('heartbeat' | 'owner_test' | 'manual' | 'webhook' | null when + // never run). Displayed inline with "Last checked" so the + // semantic shift on the public compliance contract is visible + // to the operator without having to read the changelog. + const verdictSourceLabel = cs.verdict_source === 'owner_test' ? ' (your test)' + : cs.verdict_source === 'heartbeat' ? ' (heartbeat)' + : cs.verdict_source === 'manual' ? ' (manual)' + : cs.verdict_source === 'webhook' ? ' (webhook)' + : ''; + const isPublic = cs.status !== 'opted_out'; return ` @@ -1527,7 +1540,7 @@

Agents

${visibilitySelectorHtml}
- Last checked: ${escapeHtml(lastChecked)} + Last checked: ${escapeHtml(lastChecked)}${escapeHtml(verdictSourceLabel)}
'; if (runTracks) { html += '
' + runTracks + '
'; From 0f0104c0de5b77ccfb5f6073150750f39ee4a771 Mon Sep 17 00:00:00 2001 From: Emma Mulitz Date: Fri, 8 May 2026 16:23:01 -0400 Subject: [PATCH 5/5] fix(addie): backfill owner test history + stop dual-write for owner runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR 3 of the #4247 unification stack. Migration 472 backfills every agent_test_history row with a user_id into agent_compliance_runs as triggered_by='owner_test', carrying the source id in observations_json.backfill_source_id for idempotent re-runs. Each agent's latest backfilled row upserts into agent_compliance_status so the dashboard immediately reflects a real verdict for agents tested through Addie pre-PR-#4250. evaluate_agent_quality stops calling recordTest() when the caller owns the agent — that was the dual-write that #4247 is closing. recordTest is retained ONLY for third-party runs so strangers testing someone else's agent still have a session-scoped audit trail. Drop of agent_test_history table is deferred behind the 14-day soak from #4250 + 7-day soak from #4263 + S3 cold-storage export of non-owner rows. Migration 472 documents this in its trailing comment. Stacked on #4263 → #4250. --- ...-owner-evaluate-agent-stop-legacy-write.md | 33 ++++ server/src/addie/mcp/member-tools.ts | 50 +++--- ..._agent_test_history_to_compliance_runs.sql | 149 ++++++++++++++++++ 3 files changed, 212 insertions(+), 20 deletions(-) create mode 100644 .changeset/unify-owner-evaluate-agent-stop-legacy-write.md create mode 100644 server/src/db/migrations/472_backfill_agent_test_history_to_compliance_runs.sql diff --git a/.changeset/unify-owner-evaluate-agent-stop-legacy-write.md b/.changeset/unify-owner-evaluate-agent-stop-legacy-write.md new file mode 100644 index 0000000000..c917d80bd2 --- /dev/null +++ b/.changeset/unify-owner-evaluate-agent-stop-legacy-write.md @@ -0,0 +1,33 @@ +--- +--- + +PR 3 of the #4247 unification stack. 
Two coupled changes: + +**Backfill historical owner-triggered tests into the canonical compliance +tables.** Migration `472_backfill_agent_test_history_to_compliance_runs.sql` +copies every `agent_test_history` row with a `user_id` into +`agent_compliance_runs` as `triggered_by = 'owner_test'` (carrying the +source row id in `observations_json.backfill_source_id` so a re-run is a +no-op via `WHERE NOT EXISTS`). Each agent's most-recent backfilled row +also upserts into `agent_compliance_status` so the dashboard's compliance +tile immediately reflects a real verdict for any agent that was tested +through Addie pre-PR-#4250 and never ran the heartbeat. + +**Stop the dual write for owner runs.** `evaluate_agent_quality` no longer +calls `agentContextDb.recordTest()` when the caller owns the agent — that +path was the dual-write bug #4247 is closing. The legacy `recordTest` call +is retained ONLY for third-party runs so a stranger who tests someone +else's agent still has a session-scoped audit trail in their own +`agent_test_history`. Owner-triggered runs persist exclusively to +canonical state going forward. + +**Out of scope** (deferred to a follow-up after the soak gates): + +- Drop `agent_test_history` table — gated on the 14-day soak from #4250 + deploy + 7-day soak from #4263 + S3 cold-storage export of the + remaining (`user_id IS NULL`) third-party rows. Migration 472 documents + this in its trailing comment. +- Collapse `agent_contexts.last_test_*` into a derived view — PR 4 of + the #4247 stack. + +**Stacked on** #4263 (PR 2 of #4247) → #4250 (PR 1 of #4247). diff --git a/server/src/addie/mcp/member-tools.ts b/server/src/addie/mcp/member-tools.ts index d2046c5e7c..e3b4733cf8 100644 --- a/server/src/addie/mcp/member-tools.ts +++ b/server/src/addie/mcp/member-tools.ts @@ -3613,27 +3613,37 @@ export function createMemberToolHandlers( } } - // Legacy write to agent_contexts + agent_test_history. 
Retained for - // backward compatibility until PR 3 migrates callers and drops the table. - try { - const context = await agentContextDb.getByOrgAndUrl(organizationId, resolved.resolvedUrl); - if (context) { - await agentContextDb.recordTest({ - agent_context_id: context.id, - scenario: 'quality_evaluation', - overall_passed: result.overall_status === 'passing', - steps_passed: result.summary.tracks_passed, - steps_failed: result.summary.tracks_failed, - total_duration_ms: result.total_duration_ms, - summary: result.summary.headline, - dry_run: true, - triggered_by: 'user', - user_id: memberContext?.workos_user?.workos_user_id, - agent_profile_json: result.agent_profile, - }); + // Legacy write to agent_contexts + agent_test_history. Retained ONLY + // for non-owner runs so a third-party who runs evaluate_agent_quality + // against someone else's agent still has a session-scoped audit trail + // (their own org's agent_test_history). Owner runs already wrote + // canonical state above (PR #4250); writing twice would split the + // audit and re-introduce the dual-write bug issue #4247 is closing. + // + // PR 4 of #4247 collapses agent_contexts.last_test_* into a derived + // view, after which this legacy block (and recordTest itself) drop + // entirely. 
+ if (!isAgentOwner) { + try { + const context = await agentContextDb.getByOrgAndUrl(organizationId, resolved.resolvedUrl); + if (context) { + await agentContextDb.recordTest({ + agent_context_id: context.id, + scenario: 'quality_evaluation', + overall_passed: result.overall_status === 'passing', + steps_passed: result.summary.tracks_passed, + steps_failed: result.summary.tracks_failed, + total_duration_ms: result.total_duration_ms, + summary: result.summary.headline, + dry_run: true, + triggered_by: 'user', + user_id: memberContext?.workos_user?.workos_user_id, + agent_profile_json: result.agent_profile, + }); + } + } catch (error) { + logger.debug({ error }, 'Could not record quality evaluation result'); } - } catch (error) { - logger.debug({ error }, 'Could not record quality evaluation result'); } } diff --git a/server/src/db/migrations/472_backfill_agent_test_history_to_compliance_runs.sql b/server/src/db/migrations/472_backfill_agent_test_history_to_compliance_runs.sql new file mode 100644 index 0000000000..7f53b2f376 --- /dev/null +++ b/server/src/db/migrations/472_backfill_agent_test_history_to_compliance_runs.sql @@ -0,0 +1,149 @@ +-- Migration 472: backfill owner-triggered agent_test_history rows into +-- agent_compliance_runs as triggered_by='owner_test' rows. +-- +-- Part of the #4247 compliance-state unification (PR 3 of 4). PR #4250 +-- (PR 1) made evaluate_agent_quality write canonical for owner runs going +-- forward; this migration backfills the historical rows so the compliance +-- API and dashboard reflect the full test history, not just runs from the +-- PR #4250 deploy onward. +-- +-- Scope: backfill ONLY rows with user_id IS NOT NULL (real owner-triggered +-- tests). Third-party / scheduled / unattributed rows are NOT touched here +-- — the table drop is a separate follow-up that includes an S3 cold-storage +-- export of those rows so audit history isn't silently lost (see #4247 +-- Acceptance Criteria). 
+-- +-- Mapping: +-- agent_test_history.agent_context_id → agent_contexts.agent_url +-- agent_test_history.overall_passed → overall_status ('passing' | 'failing') +-- agent_test_history.steps_passed → tracks_passed +-- agent_test_history.steps_failed → tracks_failed +-- agent_test_history.total_duration_ms→ total_duration_ms +-- agent_test_history.summary → headline +-- agent_test_history.agent_profile_json → agent_profile_json +-- agent_test_history.started_at → tested_at +-- triggered_by → 'owner_test' (constant) +-- dry_run → false (PR #4250's owner path uses dry_run=false) +-- +-- Idempotency: backfilled rows carry the source agent_test_history.id in +-- observations_json.backfill_source_id so a re-run is a no-op via the +-- WHERE NOT EXISTS guard. + +INSERT INTO agent_compliance_runs ( + agent_url, + lifecycle_stage, + overall_status, + headline, + total_duration_ms, + tracks_json, + tracks_passed, + tracks_failed, + tracks_skipped, + tracks_partial, + agent_profile_json, + observations_json, + triggered_by, + dry_run, + tested_at +) +SELECT + ac.agent_url, + COALESCE(arm.lifecycle_stage, 'production') AS lifecycle_stage, + CASE WHEN ath.overall_passed THEN 'passing' ELSE 'failing' END AS overall_status, + ath.summary AS headline, + ath.total_duration_ms, + '[]'::jsonb AS tracks_json, + COALESCE(ath.steps_passed, 0) AS tracks_passed, + COALESCE(ath.steps_failed, 0) AS tracks_failed, + 0 AS tracks_skipped, + 0 AS tracks_partial, + ath.agent_profile_json, + jsonb_build_object( + 'backfill_source', 'agent_test_history', + 'backfill_source_id', ath.id::text, + 'backfill_migration', '472', + 'original_scenario', ath.scenario + ) AS observations_json, + 'owner_test' AS triggered_by, + FALSE AS dry_run, + ath.started_at AS tested_at +FROM agent_test_history ath +JOIN agent_contexts ac ON ac.id = ath.agent_context_id +LEFT JOIN agent_registry_metadata arm ON arm.agent_url = ac.agent_url +WHERE ath.user_id IS NOT NULL + AND NOT EXISTS ( + SELECT 1 FROM 
agent_compliance_runs acr + WHERE acr.observations_json->>'backfill_source_id' = ath.id::text + ); + +-- Update agent_compliance_status from the latest backfilled row per agent +-- so the dashboard immediately reflects the most recent owner-triggered +-- verdict for any agent that didn't yet have a heartbeat row. When a status +-- row already exists with a newer last_checked_at, its status and +-- last_triggered_by are kept: the most recent run wins, matching the +-- last-write-wins contract pinned in PR #4250's tests. +INSERT INTO agent_compliance_status ( + agent_url, + status, + lifecycle_stage, + last_checked_at, + last_passed_at, + last_failed_at, + tracks_summary_json, + headline, + status_changed_at, + last_triggered_by +) +SELECT DISTINCT ON (acr.agent_url) + acr.agent_url, + CASE WHEN acr.overall_status = 'passing' THEN 'passing' ELSE 'failing' END, + acr.lifecycle_stage, + acr.tested_at, + CASE WHEN acr.overall_status = 'passing' THEN acr.tested_at ELSE NULL END, + CASE WHEN acr.overall_status = 'failing' THEN acr.tested_at ELSE NULL END, + '{}'::jsonb, + acr.headline, + acr.tested_at, + 'owner_test' +FROM agent_compliance_runs acr +WHERE acr.observations_json->>'backfill_migration' = '472' +ORDER BY acr.agent_url, acr.tested_at DESC +ON CONFLICT (agent_url) DO UPDATE SET + status = CASE + WHEN agent_compliance_status.last_checked_at IS NULL + OR agent_compliance_status.last_checked_at < EXCLUDED.last_checked_at + THEN EXCLUDED.status + ELSE agent_compliance_status.status + END, + last_checked_at = GREATEST( + COALESCE(agent_compliance_status.last_checked_at, EXCLUDED.last_checked_at), + EXCLUDED.last_checked_at + ), + last_passed_at = CASE + WHEN EXCLUDED.last_passed_at IS NOT NULL + AND (agent_compliance_status.last_passed_at IS NULL + OR agent_compliance_status.last_passed_at < EXCLUDED.last_passed_at) + THEN EXCLUDED.last_passed_at + ELSE agent_compliance_status.last_passed_at + END, + last_failed_at = CASE + WHEN EXCLUDED.last_failed_at IS NOT NULL + AND 
(agent_compliance_status.last_failed_at IS NULL + OR agent_compliance_status.last_failed_at < EXCLUDED.last_failed_at) + THEN EXCLUDED.last_failed_at + ELSE agent_compliance_status.last_failed_at + END, + last_triggered_by = CASE + WHEN agent_compliance_status.last_checked_at IS NULL + OR agent_compliance_status.last_checked_at < EXCLUDED.last_checked_at + THEN EXCLUDED.last_triggered_by + ELSE agent_compliance_status.last_triggered_by + END; + +-- NOTE: this migration does NOT drop agent_test_history. The drop is +-- deferred to a follow-up migration that runs after: +-- (a) the 14-day soak window from PR #4250 deploy, +-- (b) the 7-day soak window from PR #4263 deploy, +-- (c) S3 cold-storage export of third-party rows (user_id IS NULL), +-- (d) row-count delta verification on staging. +-- See #4247 acceptance criteria.