diff --git a/.changeset/fix-storyboard-status-scenario-keys.md b/.changeset/fix-storyboard-status-scenario-keys.md new file mode 100644 index 0000000000..adc6352661 --- /dev/null +++ b/.changeset/fix-storyboard-status-scenario-keys.md @@ -0,0 +1,6 @@ +--- +--- + +Rewrites `deriveStoryboardStatuses` to read SDK 6.x's storyboard-keyed scenarios. `comply()` emits `result.tracks[].scenarios[].scenario` as `<storyboard_id>/<phase_id>` (one per phase), but the old implementation walked YAML steps' `comply_scenario` fields and looked up bare names like `signals_flow` / `capability_discovery` — every lookup missed, so `testedCount === 0` skipped every storyboard. Net effect: zero rows in `agent_storyboard_status` have ever been written by the compliance heartbeat. The dashboard's "X passing / Y total" was structurally `0 / N` across the registry, every declared specialism was `untested`, and the AAO Verified badge pipeline silently stopped issuing. + +New implementation groups scenarios by storyboard id, rolls per-step pass counts up from each phase's `steps` array (with phase-level fallback when steps are absent), and supports the existing `storyboardIds` override for explicit-IDs callers that need an untested entry when the runner didn't run a requested storyboard. Surfaced by escalation #329 — Evgeny's agent was passing 30/30 scenarios but showing `degraded` because the storyboard counts never updated. 
diff --git a/server/src/addie/services/compliance-testing.ts b/server/src/addie/services/compliance-testing.ts index c909808999..e8f5e04587 100644 --- a/server/src/addie/services/compliance-testing.ts +++ b/server/src/addie/services/compliance-testing.ts @@ -28,8 +28,6 @@ import type { TriggeredBy, } from '../../db/compliance-db.js'; -import { getStoryboard, getAllStoryboards } from '../../services/storyboards.js'; -import type { Storyboard } from '../../services/storyboards.js'; // ── Re-exports ──────────────────────────────────────────────────── @@ -227,67 +225,104 @@ function mapOverallStatus(status: string): OverallRunStatus { /** * Derive per-storyboard pass/fail from a compliance result. * - * Maps scenario results back to storyboard steps via comply_scenario. - * For explicit runs (storyboardIds provided), only those storyboards - * are evaluated. For heartbeat runs, all storyboards with matching - * scenarios are evaluated. + * `comply()` emits one `TestResult` per *phase* of each storyboard it ran, + * keyed `/` in `result.tracks[].scenarios[].scenario` + * (see `@adcp/sdk` `compliance/storyboard-tracks.ts`). We group those by + * storyboard id and roll step-level pass counts up from each phase's + * `steps` array — which is what `agent_storyboard_status.steps_passed/total` + * record. + * + * Modes: + * - heartbeat path (no `storyboardIds`): emit an entry for every storyboard + * the SDK actually produced data for. + * - explicit-IDs path (`storyboardIds` non-empty): emit one entry per id, + * with `status='untested'` for any id the SDK didn't run. + * + * `steps_passed` / `steps_total` reflect what the SDK reported for that + * storyboard in this run. Two storyboards (or the same storyboard across + * different runs) may count steps differently: most rows are real step + * counts; rows where the SDK emitted phases without per-step data fall back + * to phase-level counts. 
The values are meaningful within a single row + * (passed/total ratio, status derivation) but should not be compared across + * rows without checking which mode produced them. */ export function deriveStoryboardStatuses( result: ComplianceResult, storyboardIds?: string[], ): StoryboardStatusEntry[] { - // Build scenario → passed map from all track results - const scenarioResults = new Map(); - for (const track of result.tracks) { + interface Aggregate { + stepsPassed: number; + stepsTotal: number; + phasesPassed: number; + phasesTotal: number; + } + const perStoryboard = new Map(); + // Storyboard ids in `static/compliance/source/**/index.yaml` are flat + // identifiers (no `/`); splitting on the first `/` therefore always yields + // the storyboard id followed by the phase id. The `<= 0` guard also + // rejects pathological leading-slash strings. + const tracks = result.tracks ?? []; + + for (const track of tracks) { for (const s of track.scenarios) { - scenarioResults.set(s.scenario, s.overall_passed); + const sepIdx = typeof s.scenario === 'string' ? s.scenario.indexOf('/') : -1; + if (sepIdx <= 0) continue; // skip legacy bare-name scenarios (no longer emitted by storyboard-driven comply()) + const sbId = s.scenario.slice(0, sepIdx); + let agg = perStoryboard.get(sbId); + if (!agg) { + agg = { stepsPassed: 0, stepsTotal: 0, phasesPassed: 0, phasesTotal: 0 }; + perStoryboard.set(sbId, agg); + } + agg.phasesTotal++; + if (s.overall_passed) agg.phasesPassed++; + + // Roll per-step results up from the phase. Some SDK paths emit a phase + // without a `steps` array (e.g. resource-resolution failures); we then + // fall back to phase-level counts below so the storyboard still + // reports a status. + const steps = s.steps ?? []; + for (const step of steps) { + agg.stepsTotal++; + if (step.passed) agg.stepsPassed++; + } } } - if (scenarioResults.size === 0) return []; - - const storyboardsToCheck: Storyboard[] = storyboardIds - ? 
storyboardIds.map(id => getStoryboard(id)).filter((s): s is Storyboard => !!s) - : getAllStoryboards(); + // Decide which storyboard ids to emit entries for. + const hasExplicitIds = !!storyboardIds && storyboardIds.length > 0; + const toEmit = hasExplicitIds ? storyboardIds! : Array.from(perStoryboard.keys()); const entries: StoryboardStatusEntry[] = []; - - for (const sb of storyboardsToCheck) { - // Collect steps with comply_scenario - const testableSteps: Array<{ stepId: string; scenario: string }> = []; - for (const phase of sb.phases) { - for (const step of phase.steps) { - if (step.comply_scenario) { - testableSteps.push({ stepId: step.id, scenario: step.comply_scenario }); - } + for (const sbId of toEmit) { + const agg = perStoryboard.get(sbId); + if (!agg) { + // Explicit id requested but the runner didn't produce data for it. + if (hasExplicitIds) { + entries.push({ storyboard_id: sbId, status: 'untested', steps_passed: 0, steps_total: 0 }); } + continue; } - if (testableSteps.length === 0) continue; - - // Only include storyboards where at least one scenario was tested - const testedCount = testableSteps.filter(s => scenarioResults.has(s.scenario)).length; - if (testedCount === 0 && !storyboardIds) continue; - - const passedCount = testableSteps.filter(s => scenarioResults.get(s.scenario) === true).length; - const totalSteps = testableSteps.length; + const useSteps = agg.stepsTotal > 0; + const passed = useSteps ? agg.stepsPassed : agg.phasesPassed; + const total = useSteps ? 
agg.stepsTotal : agg.phasesTotal; let status: StoryboardStatusEntry['status']; - if (testedCount === 0) { + if (total === 0) { status = 'untested'; - } else if (passedCount === totalSteps) { + } else if (passed === total) { status = 'passing'; - } else if (passedCount === 0) { + } else if (passed === 0) { status = 'failing'; } else { status = 'partial'; } entries.push({ - storyboard_id: sb.id, + storyboard_id: sbId, status, - steps_passed: passedCount, - steps_total: totalSteps, + steps_passed: passed, + steps_total: total, }); } diff --git a/server/src/scripts/test-comply-storyboard-statuses.ts b/server/src/scripts/test-comply-storyboard-statuses.ts new file mode 100644 index 0000000000..2a26eda3d1 --- /dev/null +++ b/server/src/scripts/test-comply-storyboard-statuses.ts @@ -0,0 +1,95 @@ +/** + * Run `comply()` against an agent URL and print what + * `deriveStoryboardStatuses` would produce. Read-only — no DB writes. + * + * Lets us validate the new SDK-6.x scenario-key parser against real agents + * before merging. Mirrors what the compliance heartbeat does for the + * storyboard-status piece, but prints to stdout instead of recording. + * + * Usage: + * npx tsx server/src/scripts/test-comply-storyboard-statuses.ts + * npx tsx server/src/scripts/test-comply-storyboard-statuses.ts ... 
+ */ + +import { AAO_UA_COMPLIANCE } from '../config/user-agents.js'; +import { + comply, + deriveStoryboardStatuses, + complianceResultToDbInput, + type ComplyOptions, +} from '../addie/services/compliance-testing.js'; + +const urls = process.argv.slice(2).filter(a => !a.startsWith('--')); + +if (urls.length === 0) { + console.error('Usage: test-comply-storyboard-statuses.ts [ ...]'); + process.exit(1); +} + +async function probe(agentUrl: string): Promise { + console.log(`\n${'='.repeat(80)}\nAgent: ${agentUrl}\n${'='.repeat(80)}`); + const start = Date.now(); + + const opts: ComplyOptions = { + test_session_id: `local-probe-${Date.now()}`, + timeout_ms: 90_000, + userAgent: AAO_UA_COMPLIANCE, + }; + + let result; + try { + result = await comply(agentUrl, opts); + } catch (err) { + console.log(` comply() threw: ${err instanceof Error ? err.message : String(err)}`); + return; + } + + const duration = Date.now() - start; + console.log(`\nOverall: ${result.overall_status} (${duration}ms)`); + console.log(`Headline: ${result.summary.headline}`); + console.log(`Declared specialisms: ${JSON.stringify(result.agent_profile?.specialisms ?? [])}`); + console.log(`Storyboards executed: ${JSON.stringify(result.storyboards_executed ?? '(field absent)')}`); + + console.log(`\nTracks:`); + for (const t of result.tracks) { + console.log(` ${t.track.padEnd(20)} status=${t.status.padEnd(8)} scenarios=${t.scenarios.length}`); + for (const s of t.scenarios.slice(0, 6)) { + const pass = s.overall_passed ? '✓' : '✗'; + const stepCount = s.steps?.length ?? 0; + const stepsPassed = s.steps?.filter(st => st.passed).length ?? 
0; + console.log(` ${pass} ${s.scenario.padEnd(50)} steps=${stepsPassed}/${stepCount}`); + } + if (t.scenarios.length > 6) { + console.log(` … +${t.scenarios.length - 6} more`); + } + } + + console.log(`\nderiveStoryboardStatuses() output (what the heartbeat would persist):`); + const entries = deriveStoryboardStatuses(result); + if (entries.length === 0) { + console.log(` (empty — nothing to persist)`); + } else { + for (const e of entries) { + console.log(` ${e.storyboard_id.padEnd(40)} ${e.status.padEnd(10)} steps=${e.steps_passed}/${e.steps_total}`); + } + } + + console.log(`\ncomplianceResultToDbInput().storyboard_statuses (full input shape):`); + const dbInput = complianceResultToDbInput(result, agentUrl, 'production', 'manual'); + console.log(` count: ${dbInput.storyboard_statuses?.length ?? 0}`); + if (dbInput.storyboard_statuses?.length) { + console.log(JSON.stringify(dbInput.storyboard_statuses, null, 2)); + } +} + +async function main(): Promise { + for (const url of urls) { + await probe(url); + } + console.log(''); +} + +main().catch((err) => { + console.error('Probe failed:', err); + process.exit(1); +}); diff --git a/server/tests/unit/derive-storyboard-statuses.test.ts b/server/tests/unit/derive-storyboard-statuses.test.ts new file mode 100644 index 0000000000..cb136f0686 --- /dev/null +++ b/server/tests/unit/derive-storyboard-statuses.test.ts @@ -0,0 +1,241 @@ +import { describe, it, expect } from 'vitest'; +import { deriveStoryboardStatuses } from '../../src/addie/services/compliance-testing.js'; +import type { ComplianceResult } from '@adcp/sdk/testing'; + +/** + * Minimal builder for ComplianceResult fixtures. + * + * `comply()` returns one TestResult per phase of each storyboard, keyed + * `/`. The fixtures here construct that shape + * directly so the tests pin the scenario-key contract we read from the SDK. 
+ */ +function makeResult( + scenarios: Array<{ + scenario: string; + passed: boolean; + steps?: Array<{ passed: boolean; step?: string }>; + }>, +): ComplianceResult { + return { + agent_url: 'https://example.test/mcp', + overall_status: 'passing', + tracks: [ + { + track: 'signals', + label: 'Signals', + status: 'passing', + duration_ms: 0, + skipped_scenarios: [], + observations: [], + scenarios: scenarios.map(s => ({ + agent_url: 'https://example.test/mcp', + scenario: s.scenario as unknown as ComplianceResult['tracks'][number]['scenarios'][number]['scenario'], + overall_passed: s.passed, + steps: s.steps?.map(step => ({ + step: step.step ?? 'step', + passed: step.passed, + duration_ms: 0, + })), + summary: 'fixture', + total_duration_ms: 0, + tested_at: '2026-05-11T00:00:00.000Z', + })), + }, + ], + tested_tracks: [], + skipped_tracks: [], + summary: { + tracks_passed: 0, + tracks_failed: 0, + tracks_skipped: 0, + tracks_partial: 0, + tracks_silent: 0, + headline: 'fixture', + }, + observations: [], + tested_at: '2026-05-11T00:00:00.000Z', + total_duration_ms: 0, + } as unknown as ComplianceResult; +} + +describe('deriveStoryboardStatuses', () => { + it('emits one entry per storyboard the runner produced data for', () => { + const result = makeResult([ + { scenario: 'signal_owned/capability_discovery', passed: true, steps: [{ passed: true }] }, + { scenario: 'signal_owned/discovery', passed: true, steps: [{ passed: true }, { passed: true }] }, + { scenario: 'signals_baseline/discover_and_activate', passed: true, steps: [{ passed: true }] }, + ]); + const entries = deriveStoryboardStatuses(result); + const ids = entries.map(e => e.storyboard_id).sort(); + expect(ids).toEqual(['signal_owned', 'signals_baseline']); + }); + + it('marks a storyboard passing when every phase passes (step counts roll up)', () => { + const result = makeResult([ + { scenario: 'signal_owned/capability_discovery', passed: true, steps: [{ passed: true }] }, + { scenario: 
'signal_owned/discovery', passed: true, steps: [{ passed: true }, { passed: true }] }, + { scenario: 'signal_owned/activation', passed: true, steps: [{ passed: true }] }, + ]); + const [entry] = deriveStoryboardStatuses(result); + expect(entry).toEqual({ + storyboard_id: 'signal_owned', + status: 'passing', + steps_passed: 4, + steps_total: 4, + }); + }); + + it("marks a storyboard partial when some phases' steps fail", () => { + const result = makeResult([ + { scenario: 'signal_owned/capability_discovery', passed: true, steps: [{ passed: true }] }, + { scenario: 'signal_owned/discovery', passed: false, steps: [{ passed: true }, { passed: false }] }, + ]); + const [entry] = deriveStoryboardStatuses(result); + expect(entry).toMatchObject({ + storyboard_id: 'signal_owned', + status: 'partial', + steps_passed: 2, + steps_total: 3, + }); + }); + + it('marks a storyboard failing when every step failed', () => { + const result = makeResult([ + { scenario: 'signal_owned/capability_discovery', passed: false, steps: [{ passed: false }] }, + { scenario: 'signal_owned/discovery', passed: false, steps: [{ passed: false }, { passed: false }] }, + ]); + const [entry] = deriveStoryboardStatuses(result); + expect(entry).toMatchObject({ status: 'failing', steps_passed: 0, steps_total: 3 }); + }); + + it('falls back to phase-level counts when phases have no steps array', () => { + const result = makeResult([ + { scenario: 'signal_owned/capability_discovery', passed: true }, + { scenario: 'signal_owned/discovery', passed: false }, + ]); + const [entry] = deriveStoryboardStatuses(result); + expect(entry).toMatchObject({ + storyboard_id: 'signal_owned', + status: 'partial', + steps_passed: 1, + steps_total: 2, + }); + }); + + it('skips legacy bare-name scenarios (no "/" separator)', () => { + const result = makeResult([ + { scenario: 'signals_flow', passed: true, steps: [{ passed: true }] }, + { scenario: 'capability_discovery', passed: true, steps: [{ passed: true }] }, + ]); + 
expect(deriveStoryboardStatuses(result)).toEqual([]); + }); + + it('returns empty when no scenarios were produced', () => { + expect(deriveStoryboardStatuses(makeResult([]))).toEqual([]); + }); + + it('aggregates a storyboard whose phases appear in multiple tracks', () => { + const r = makeResult([]); + r.tracks = [ + { + track: 'core', + label: 'Core', + status: 'passing', + duration_ms: 0, + skipped_scenarios: [], + observations: [], + scenarios: [ + { + agent_url: 'https://example.test/mcp', + scenario: 'sales_non_guaranteed/capability_discovery' as never, + overall_passed: true, + steps: [{ step: 'a', passed: true, duration_ms: 0 }], + summary: '', + total_duration_ms: 0, + tested_at: '', + }, + ], + }, + { + track: 'media_buy', + label: 'Media Buy', + status: 'passing', + duration_ms: 0, + skipped_scenarios: [], + observations: [], + scenarios: [ + { + agent_url: 'https://example.test/mcp', + scenario: 'sales_non_guaranteed/create_buy' as never, + overall_passed: true, + steps: [{ step: 'b', passed: true, duration_ms: 0 }, { step: 'c', passed: false, duration_ms: 0 }], + summary: '', + total_duration_ms: 0, + tested_at: '', + }, + ], + }, + ] as unknown as ComplianceResult['tracks']; + const entries = deriveStoryboardStatuses(r); + expect(entries).toHaveLength(1); + expect(entries[0]).toMatchObject({ + storyboard_id: 'sales_non_guaranteed', + status: 'partial', + steps_passed: 2, + steps_total: 3, + }); + }); + + it('handles result.tracks being absent', () => { + const r = makeResult([]); + (r as { tracks?: unknown }).tracks = undefined; + expect(deriveStoryboardStatuses(r)).toEqual([]); + }); + + it('ignores non-string scenario values without throwing', () => { + const r = makeResult([]); + r.tracks[0].scenarios = [ + { + agent_url: 'https://example.test/mcp', + scenario: null as never, + overall_passed: true, + steps: [{ step: 'x', passed: true, duration_ms: 0 }], + summary: '', + total_duration_ms: 0, + tested_at: '', + }, + { + agent_url: 
'https://example.test/mcp', + scenario: 12345 as never, + overall_passed: true, + steps: [{ step: 'y', passed: true, duration_ms: 0 }], + summary: '', + total_duration_ms: 0, + tested_at: '', + }, + ]; + expect(deriveStoryboardStatuses(r)).toEqual([]); + }); + + describe('with explicit storyboardIds', () => { + it('emits untested entry when the runner did not run a requested storyboard', () => { + const result = makeResult([ + { scenario: 'signal_owned/capability_discovery', passed: true, steps: [{ passed: true }] }, + ]); + const entries = deriveStoryboardStatuses(result, ['signal_owned', 'signal_marketplace']); + expect(entries).toEqual([ + { storyboard_id: 'signal_owned', status: 'passing', steps_passed: 1, steps_total: 1 }, + { storyboard_id: 'signal_marketplace', status: 'untested', steps_passed: 0, steps_total: 0 }, + ]); + }); + + it('only emits entries for the requested ids even when more were run', () => { + const result = makeResult([ + { scenario: 'signal_owned/p1', passed: true, steps: [{ passed: true }] }, + { scenario: 'signals_baseline/p1', passed: true, steps: [{ passed: true }] }, + ]); + const entries = deriveStoryboardStatuses(result, ['signal_owned']); + expect(entries.map(e => e.storyboard_id)).toEqual(['signal_owned']); + }); + }); +});