diff --git a/.changeset/final-cleanup-drop-test-history-and-last-test-columns.md b/.changeset/final-cleanup-drop-test-history-and-last-test-columns.md new file mode 100644 index 0000000000..3260854926 --- /dev/null +++ b/.changeset/final-cleanup-drop-test-history-and-last-test-columns.md @@ -0,0 +1,59 @@ +--- +--- + +Final cleanup of the #4247 compliance-state unification stack. Drops +`agent_test_history` and the `agent_contexts.last_test_*` columns now +that owner test runs persist canonically via +`agent_compliance_runs.triggered_by='owner_test'` (PR #4250) and +read-side derivation goes through `agent_context_with_latest_test` +(PR #4268). + +**Pre-merge gate (load-bearing — destructive migration):** + +1. PR #4250 ≥ 14 days live in prod with zero canonical-write incidents. +2. PR #4263 ≥ 7 days live in prod with the dashboard rendering identical + verdicts via the view-derived path. +3. PR #4264's migration 472 has run; row-count delta on staging is ±0 + (every owner-triggered `agent_test_history` row backfilled into + `agent_compliance_runs`). +4. Third-party (`user_id IS NULL`) rows from `agent_test_history` + exported to S3 cold storage. Export evidence committed to the ops + runbook before the migration runs. Reversibility path is the export, + not pg_dump. +5. PR #4268's view + reader migration confirmed working in prod. + +**What this PR does.** + +- **Migration 474.** Redefines `agent_context_summary` view without + references to the dropped table/columns; drops + `agent_contexts.last_test_*` columns; drops `agent_test_history` + table; refreshes `agent_context_with_latest_test` so the view's + `ac.*` projection no longer carries the removed columns. +- **`agent-context-db.ts`.** Removes `recordTest`, `getTestHistory`, + `getLatestTestForUser`, the `AgentTestHistory` interface, and the + `RecordTestInput` interface. The `last_test_*` SET branches in + `update()` go away; the method now refetches via `getById()` after + the UPDATE so derived view fields stay populated. +- **`evaluate_agent_quality`.** The third-party `recordTest()` write + path is removed. Non-owner runs are now session-scoped — they return + results in the response and do not persist. +- **`run_storyboard`.** The `recordTest()` write path is removed. + Single-storyboard runs remain session-scoped (they don't write + canonical state because that would over-state the test coverage of + a single storyboard). A future `triggered_by = 'storyboard_test'` + enum value would expand canonical writes here, but that's a separate + design discussion. + +**Behavior change.** + +- Third-party / non-owner `evaluate_agent_quality` runs against + someone else's agent no longer leave any persistent state in the + registry. Matches the "owner-only canonical writes" policy from + #4247. Stranger-runs return results to the caller in the same + shape; they just don't persist. +- `run_storyboard` runs (any caller) no longer leave persistent state + in the registry. The dashboard's "tested at" timestamps for an org + reflect only `evaluate_agent_quality` runs (which exercise the full + comply suite); single-storyboard runs are exploratory tooling. + +**Stacked on** #4268 (PR 4) → #4264 (PR 3) → #4263 (PR 2) → #4250 (PR 1). diff --git a/server/src/addie/mcp/member-tools.ts b/server/src/addie/mcp/member-tools.ts index 8d9ff72bce..515190ee73 100644 --- a/server/src/addie/mcp/member-tools.ts +++ b/server/src/addie/mcp/member-tools.ts @@ -3618,38 +3618,12 @@ export function createMemberToolHandlers( } } - // Legacy write to agent_contexts + agent_test_history. Retained ONLY - // for non-owner runs so a third-party who runs evaluate_agent_quality - // against someone else's agent still has a session-scoped audit trail - // (their own org's agent_test_history). Owner runs already wrote - // canonical state above (PR #4250); writing twice would split the - // audit and re-introduce the dual-write bug PR #4247 is closing. - // - // PR 4 of #4247 collapses agent_contexts.last_test_* into a derived - // view, after which this legacy block (and recordTest itself) drop - // entirely. - if (!isAgentOwner) { - try { - const context = await agentContextDb.getByOrgAndUrl(organizationId, resolved.resolvedUrl); - if (context) { - await agentContextDb.recordTest({ - agent_context_id: context.id, - scenario: 'quality_evaluation', - overall_passed: result.overall_status === 'passing', - steps_passed: result.summary.tracks_passed, - steps_failed: result.summary.tracks_failed, - total_duration_ms: result.total_duration_ms, - summary: result.summary.headline, - dry_run: true, - triggered_by: 'user', - user_id: memberContext?.workos_user?.workos_user_id, - agent_profile_json: result.agent_profile, - }); - } - } catch (error) { - logger.debug({ error }, 'Could not record quality evaluation result'); - } - } + // Non-owner runs are now session-scoped — they return results to + // the caller in the response and do not persist anywhere. The legacy + // recordTest path that wrote to agent_test_history was dropped in + // migration 474 (#4247 final cleanup). Strangers testing someone + // else's agent no longer leave persistent state in the registry, + // matching the "owner-only canonical writes" policy from #4247. } // Build structured output for Addie to interpret @@ -4123,33 +4097,18 @@ export function createMemberToolHandlers( return `Agent at ${resolved.resolvedUrl} requires authentication. Use \`save_agent\` to store credentials first, then try again.`; } - // Record the run in agent_test_history when we have a saved - // agent_context for this org+url. Mirrors evaluate_agent_quality's - // pattern; powers the "agent not tested in 14d" prompt rule. - // Storyboard runs don't carry a structured agent_profile (only - // evaluate_agent_quality probes get_adcp_capabilities), so we - // omit agent_profile_json — readers tolerate null. - if (organizationId) { - try { - const context = await agentContextDb.getByOrgAndUrl(organizationId, resolved.resolvedUrl); - if (context) { - await agentContextDb.recordTest({ - agent_context_id: context.id, - scenario: `storyboard:${sb.id}`, - overall_passed: result.overall_passed, - steps_passed: result.passed_count, - steps_failed: result.failed_count, - total_duration_ms: result.total_duration_ms, - summary: result.storyboard_title, - dry_run: dryRun, - triggered_by: 'user', - user_id: memberContext?.workos_user?.workos_user_id, - }); - } - } catch (error) { - logger.debug({ error }, 'Could not record storyboard run'); - } - } + // run_storyboard runs a single storyboard (vs evaluate_agent_quality's + // full comply suite). Single-storyboard runs do not currently write + // canonical state — that would require synthesizing a comply-shaped + // result with one track, which over-states the test coverage. The + // legacy recordTest call that wrote to agent_test_history was dropped + // in migration 474 (#4247 final cleanup); single-storyboard runs are + // exploratory and remain session-scoped. + // + // If a future track wants to surface storyboard runs in + // agent_compliance_runs as a distinct triggered_by value (e.g. + // 'storyboard_test'), open a follow-up — that's a schema change with + // its own design discussion. let output = ''; if (resolved.source === 'saved') output += '_Using saved credentials._\n\n'; diff --git a/server/src/db/agent-context-db.ts b/server/src/db/agent-context-db.ts index 3e72998d84..edd1fe118e 100644 --- a/server/src/db/agent-context-db.ts +++ b/server/src/db/agent-context-db.ts @@ -35,7 +35,11 @@ export interface AgentContext { // Discovery cache tools_discovered: string[] | null; last_discovered_at: Date | null; - // Test history + // Test history — derived from agent_compliance_runs via the + // `agent_context_with_latest_test` view (migration 473). The underlying + // columns on agent_contexts were dropped in migration 474; readers + // continue to see the same field names because the view aliases the + // canonical_* columns back to the original names. last_test_scenario: string | null; last_test_passed: boolean | null; last_test_summary: string | null; @@ -81,24 +85,10 @@ export interface OAuthClientCredentials { auth_method?: 'basic' | 'body'; } -export interface AgentTestHistory { - id: string; - agent_context_id: string; - scenario: string; - overall_passed: boolean; - steps_passed: number; - steps_failed: number; - total_duration_ms: number | null; - summary: string | null; - dry_run: boolean; - brief: string | null; - triggered_by: string | null; - user_id: string | null; - steps_json: any; - agent_profile_json: any; - started_at: Date; - completed_at: Date | null; -} +// AgentTestHistory interface and RecordTestInput removed in migration 474. +// agent_test_history table dropped; owner test results live in +// agent_compliance_runs (canonical) via evaluate_agent_quality's +// owner-test write path. export interface CreateAgentContextInput { organization_id: string; @@ -114,25 +104,9 @@ export interface UpdateAgentContextInput { agent_type?: AgentType; protocol?: Protocol; tools_discovered?: string[]; - last_test_scenario?: string; - last_test_passed?: boolean; - last_test_summary?: string; -} - -export interface RecordTestInput { - agent_context_id: string; - scenario: string; - overall_passed: boolean; - steps_passed: number; - steps_failed: number; - total_duration_ms?: number; - summary?: string; - dry_run?: boolean; - brief?: string; - triggered_by?: string; - user_id?: string; - steps_json?: any; - agent_profile_json?: any; + // last_test_* fields removed — derived from agent_compliance_runs via + // the agent_context_with_latest_test view (migrations 473 + 474). + // Owner test runs write to canonical state via recordComplianceRun. } function getTokenHint(token: string, authType: AuthType = 'bearer'): string { @@ -370,18 +344,10 @@ export class AgentContextDatabase { updates.push(`last_discovered_at = NOW()`); values.push(input.tools_discovered); } - if (input.last_test_scenario !== undefined) { - updates.push(`last_test_scenario = $${paramIndex++}`); - values.push(input.last_test_scenario); - } - if (input.last_test_passed !== undefined) { - updates.push(`last_test_passed = $${paramIndex++}`); - values.push(input.last_test_passed); - } - if (input.last_test_summary !== undefined) { - updates.push(`last_test_summary = $${paramIndex++}`); - values.push(input.last_test_summary); - } + // last_test_* SET branches removed — those columns no longer exist on + // agent_contexts (migration 474). Owner test runs persist via + // recordComplianceRun; the values flow back through + // agent_context_with_latest_test on read. if (updates.length === 0) { return this.getById(id); @@ -390,40 +356,17 @@ export class AgentContextDatabase { updates.push('updated_at = NOW()'); values.push(id); - const result = await query( + // UPDATE the table, then read back through the view so derived + // last_test_* fields are populated. RETURNING off the bare table no + // longer carries those fields, and refetching via getById is the + // cleanest way to keep the AgentContext shape stable for callers. + await query( `UPDATE agent_contexts SET ${updates.join(', ')} - WHERE id = $${paramIndex} - RETURNING - id, - organization_id, - agent_url, - agent_name, - agent_type, - protocol, - auth_token_encrypted IS NOT NULL as has_auth_token, - auth_token_hint, - auth_type, - oauth_access_token_encrypted IS NOT NULL as has_oauth_token, - oauth_refresh_token_encrypted IS NOT NULL as has_oauth_refresh_token, - oauth_token_expires_at, - oauth_client_id IS NOT NULL as has_oauth_client, - (oauth_cc_token_endpoint IS NOT NULL - AND oauth_cc_client_id IS NOT NULL - AND oauth_cc_client_secret_encrypted IS NOT NULL) as has_oauth_client_credentials, - tools_discovered, - last_discovered_at, - last_test_scenario, - last_test_passed, - last_test_summary, - last_tested_at, - total_tests_run, - created_at, - updated_at, - created_by`, + WHERE id = $${paramIndex}`, values ); - return result.rows[0] || null; + return this.getById(id); } /** @@ -906,103 +849,15 @@ export class AgentContextDatabase { return (result.rowCount ?? 0) > 0; } - /** - * Record a test run - */ - async recordTest(input: RecordTestInput): Promise { - // Update the agent context - await query( - `UPDATE agent_contexts - SET - last_test_scenario = $1, - last_test_passed = $2, - last_test_summary = $3, - last_tested_at = NOW(), - total_tests_run = total_tests_run + 1, - updated_at = NOW() - WHERE id = $4`, - [input.scenario, input.overall_passed, input.summary || null, input.agent_context_id] - ); - - // Insert history record - const result = await query( - `INSERT INTO agent_test_history ( - agent_context_id, - scenario, - overall_passed, - steps_passed, - steps_failed, - total_duration_ms, - summary, - dry_run, - brief, - triggered_by, - user_id, - steps_json, - agent_profile_json, - completed_at - ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, NOW()) - RETURNING *`, - [ - input.agent_context_id, - input.scenario, - input.overall_passed, - input.steps_passed, - input.steps_failed, - input.total_duration_ms || null, - input.summary || null, - input.dry_run ?? true, - input.brief || null, - input.triggered_by || null, - input.user_id || null, - input.steps_json ? JSON.stringify(input.steps_json) : null, - input.agent_profile_json ? JSON.stringify(input.agent_profile_json) : null, - ] - ); - - return result.rows[0]; - } - - /** - * Get test history for an agent - */ - async getTestHistory(agentContextId: string, limit: number = 20): Promise { - const result = await query( - `SELECT * - FROM agent_test_history - WHERE agent_context_id = $1 - ORDER BY started_at DESC - LIMIT $2`, - [agentContextId, limit] - ); - return result.rows; - } - - /** - * Most recent agent test the user has run, across all of their saved - * agents. Powers the "agent not tested in X days" suggested-prompts - * rule for builder personas. - * - * Returns null if the user has never tested any registered agent. - * Tests against the public test agent or unsaved URLs are not - * recorded in agent_test_history (they don't have an agent_context), - * so they don't count here either — which is the right semantic for - * the rule's audience (builders with their own seller agent). - */ - async getLatestTestForUser(workosUserId: string): Promise<{ - started_at: Date; - overall_passed: boolean; - } | null> { - const result = await query<{ started_at: Date; overall_passed: boolean }>( - `SELECT started_at, overall_passed - FROM agent_test_history - WHERE user_id = $1 - ORDER BY started_at DESC - LIMIT 1`, - [workosUserId] - ); - return result.rows[0] ?? null; - } + // recordTest, getTestHistory, getLatestTestForUser removed in migration 474. + // + // Owner-triggered test runs persist via complianceDb.recordComplianceRun + // with triggered_by='owner_test' (PR #4250). Read-side query for "latest + // test by this user/org" goes through the agent_context_with_latest_test + // view (PR #4268), which derives last_tested_at from + // agent_compliance_runs scoped by (triggered_org_id, agent_url). + // Third-party (non-owner) runs are session-scoped only — they return + // results in the response and do not persist anywhere. /** * Infer agent type from discovered tools diff --git a/server/src/db/migrations/474_drop_agent_test_history_and_last_test_columns.sql b/server/src/db/migrations/474_drop_agent_test_history_and_last_test_columns.sql new file mode 100644 index 0000000000..b6f1d738e7 --- /dev/null +++ b/server/src/db/migrations/474_drop_agent_test_history_and_last_test_columns.sql @@ -0,0 +1,126 @@ +-- Migration 474: drop agent_test_history table + agent_contexts.last_test_* +-- columns. Final cleanup of the #4247 compliance-state unification stack. +-- +-- Pre-merge gate (load-bearing — DO NOT RUN until each is satisfied): +-- +-- 1. PR #4250 has been live in prod for ≥ 14 days with zero canonical-write +-- incidents (no malformed agent_compliance_status row, no flap reports). +-- 2. PR #4263 has been live in prod for ≥ 7 days with the dashboard +-- rendering identical verdicts via the view-derived path. +-- 3. PR #4264's migration 472 has run and the row-count delta on staging +-- is ±0 (every owner-triggered agent_test_history row backfilled into +-- agent_compliance_runs). +-- 4. Third-party (`user_id IS NULL`) rows from agent_test_history have +-- been exported to S3 cold storage. Export evidence committed to the +-- ops runbook before this migration runs. Do NOT silently lose audit +-- history. +-- 5. PR #4268's view + reader migration confirmed working in prod +-- (all callers of last_test_* read from agent_context_with_latest_test +-- via the column-aliased SELECTs). +-- +-- This migration is destructive and irreversible. Reversibility path is +-- the S3 export from gate (4), not pg_dump. + +-- ── Phase 1: drop the dependent view, redefining without legacy columns ── + +DROP VIEW IF EXISTS agent_context_summary; + +CREATE OR REPLACE VIEW agent_context_summary AS +SELECT + ac.id, + ac.organization_id, + ac.agent_url, + ac.agent_name, + ac.agent_type, + ac.protocol, + ac.auth_token_hint, + ac.auth_token_encrypted IS NOT NULL as has_auth_token, + ac.oauth_access_token_encrypted IS NOT NULL as has_oauth_token, + ac.oauth_token_expires_at, + ac.oauth_client_id IS NOT NULL as has_oauth_client, + ac.tools_discovered, + -- last_test_* fields now derived from agent_compliance_runs scoped to + -- (triggered_org_id, agent_url) rather than read off the legacy + -- agent_contexts columns. Mirrors agent_context_with_latest_test + -- (migration 473) so callers can stop using the legacy view if they + -- prefer the explicit canonical-source name. + v.canonical_last_test_scenario AS last_test_scenario, + v.canonical_last_test_passed AS last_test_passed, + v.canonical_last_test_summary AS last_test_summary, + v.canonical_last_tested_at AS last_tested_at, + v.canonical_total_tests_run AS total_tests_run, + ac.created_at, + ac.updated_at, + -- history_count / history_passed_count from agent_test_history removed — + -- the canonical-runs derivation gives total_tests_run, and a history + -- count of "passing runs" is inferable from the per-org rollup of + -- agent_compliance_runs if needed. + v.canonical_total_tests_run AS history_count, + COALESCE(( + SELECT COUNT(*) + FROM agent_compliance_runs acr + WHERE acr.triggered_org_id = ac.organization_id + AND acr.agent_url = ac.agent_url + AND acr.dry_run = FALSE + AND acr.overall_status = 'passing' + ), 0) AS history_passed_count +FROM agent_contexts ac +LEFT JOIN agent_context_with_latest_test v ON v.id = ac.id; + +COMMENT ON VIEW agent_context_summary IS + 'Agent contexts with auth info and last_test_* derived from agent_compliance_runs (canonical). Replaces the migration 195 definition that referenced the dropped agent_test_history.'; + +-- ── Phase 2: drop legacy columns from agent_contexts ── + +-- agent_context_with_latest_test (migration 473) selects ac.* — once these +-- columns drop, ac.* simply omits them. The view's aliased +-- canonical_last_test_* fields stay intact (they come from the LATERAL +-- JOIN against agent_compliance_runs). + +ALTER TABLE agent_contexts DROP COLUMN IF EXISTS last_test_scenario; +ALTER TABLE agent_contexts DROP COLUMN IF EXISTS last_test_passed; +ALTER TABLE agent_contexts DROP COLUMN IF EXISTS last_test_summary; +ALTER TABLE agent_contexts DROP COLUMN IF EXISTS last_tested_at; +ALTER TABLE agent_contexts DROP COLUMN IF EXISTS total_tests_run; + +-- ── Phase 3: drop agent_test_history ── + +-- Indexes drop with the table. +-- ON DELETE CASCADE FK from agent_contexts.id is severed when the table +-- drops; agent_contexts is unaffected. + +DROP TABLE IF EXISTS agent_test_history; + +-- ── Phase 4: refresh agent_context_with_latest_test ── +-- The view (from migration 473) selects ac.* — once last_test_* columns +-- are gone, the view's projection no longer includes them. CREATE OR +-- REPLACE re-binds the view definition cleanly. + +CREATE OR REPLACE VIEW agent_context_with_latest_test AS +SELECT + ac.*, + latest.tested_at AS canonical_last_tested_at, + latest.overall_status = 'passing' AS canonical_last_test_passed, + COALESCE( + (latest.tracks_json -> 0 ->> 'track'), + 'compliance' + ) AS canonical_last_test_scenario, + latest.headline AS canonical_last_test_summary, + COALESCE(run_counts.total, 0) AS canonical_total_tests_run +FROM agent_contexts ac +LEFT JOIN LATERAL ( + SELECT tested_at, overall_status, tracks_json, headline + FROM agent_compliance_runs acr + WHERE acr.triggered_org_id = ac.organization_id + AND acr.agent_url = ac.agent_url + AND acr.dry_run = FALSE + ORDER BY tested_at DESC + LIMIT 1 +) AS latest ON TRUE +LEFT JOIN LATERAL ( + SELECT COUNT(*)::INT AS total + FROM agent_compliance_runs acr + WHERE acr.triggered_org_id = ac.organization_id + AND acr.agent_url = ac.agent_url + AND acr.dry_run = FALSE +) AS run_counts ON TRUE;