diff --git a/benchmarks/v2/README.md b/benchmarks/v2/README.md new file mode 100644 index 0000000..d4fe54a --- /dev/null +++ b/benchmarks/v2/README.md @@ -0,0 +1,61 @@ +# SLOP Benchmarks v2 — Experiment rig (WIP) + +Successor to [`benchmarks/mcp-vs-slop`](../mcp-vs-slop/). v1 stays in place as a regression anchor; v2 turns it into a proper experiment framework so we can drive SLOP v0.2 spec decisions from data. + +Design spec: [fluffy-napping-walrus.md](../../.claude/plans/fluffy-napping-walrus.md) (local plan file). + +## Status + +- [x] Phase A — DGX inference path (OpenAI-compat provider + smoke test) +- [ ] Phase B — Sweep runner + config matrix +- [ ] Phase C — Prompt / encoding / optimization variants +- [ ] Phase C' — Fair-MCP variants +- [ ] Phase D — Metrics + statistical post-processing +- [ ] Phase E — Static dashboard +- [ ] Phase F — App complexity ladder (todo, file-browser, crm) + +## DGX Spark setup + +Models are served via Ollama on `slopinator-s-1.local`. The systemd unit has an override that binds Ollama to all interfaces on both address families: + +```ini +# /etc/systemd/system/ollama.service.d/override.conf +[Service] +Environment=OLLAMA_HOST=[::]:11434 +``` + +`::` binds IPv4 and IPv6 — required because Bun's fetch resolves `.local` names to IPv6 first and doesn't fall back. If the override is ever lost, Bun will report `ConnectionRefused` while curl still works; that's the tell. + +## Smoke test + +```bash +cd benchmarks/v2 +bun run smoke/provider-test.ts +SLOP_SMOKE_MODEL=nemotron-3-super:120b bun run smoke/provider-test.ts +``` + +Runs a multi-turn tool-calling conversation (weather lookup → answer) against the configured model. Prints per-turn token counts, latency, and whether the model successfully delivered the final answer tool-call. Fails loudly if the OpenAI-compat endpoint misbehaves. 
+ +## Environment variables + +| Var | Default | Notes | +|---|---|---| +| `SLOP_DGX_URL` | `http://slopinator-s-1.local:11434/v1` | Override to point at a different host | +| `SLOP_SMOKE_MODEL` | `gemma4:31b` | Any model in `ollama list` | + +## Layout (target) + +``` +v2/ +├── providers/ # LlmProvider interface + adapters +│ ├── types.ts +│ └── openai-compat.ts # Ollama, vLLM, OpenAI, anything /v1-compatible +├── variants/ # prompts/, encodings/, optimizations/ (Phase C) +├── mcp-variants/ # fair-MCP pass (Phase C') +├── apps/ # todo / file-browser / issue-tracker / crm (Phase F) +├── scenarios/ # shared scenario types +├── metrics/ # collectors + stats (Phase D) +├── runner/ # sweep orchestrator (Phase B) +├── dashboard/ # static HTML report (Phase E) +└── smoke/ # validation scripts +``` diff --git a/benchmarks/v2/apps/crm/index.ts b/benchmarks/v2/apps/crm/index.ts new file mode 100644 index 0000000..97d58ee --- /dev/null +++ b/benchmarks/v2/apps/crm/index.ts @@ -0,0 +1,88 @@ +import { Client } from "@modelcontextprotocol/sdk/client"; +import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"; +import { CrmStore } from "./store.ts"; +import { seedCrm } from "./seed.ts"; +import { startCrmSlopServer, type CrmSlopOpts } from "./slop-server.ts"; +import { crmScenarios } from "./scenarios.ts"; +import type { AppBinding, AppStore, McpServerHandle, SlopServerHandle } from "../registry.ts"; +import type { DataScale } from "../../runner/types.ts"; +import type { Scenario, VerificationResult } from "../../../mcp-vs-slop/scenarios/types.ts"; + +function wrap(inner: CrmStore): AppStore & { inner: CrmStore } { + return { __brand: "app-store", inner } as AppStore & { inner: CrmStore }; +} + +export const crmApp: AppBinding = { + id: "crm", + supportedScales: ["s", "m", "l", "xl"], + createStore(scale, seed) { + const store = new CrmStore(); + const { contacts, deals, activities } = seedCrm(scale, seed); + store.reset(contacts, deals, activities); + 
return wrap(store); + }, + async startSlopServer(store, port, opts): Promise { + const inner = (store as unknown as { inner: CrmStore }).inner; + const { server, slop } = startCrmSlopServer(inner, port, opts as CrmSlopOpts | undefined); + return { + wsUrl: `ws://localhost:${port}/slop`, + stop: async () => { + slop.stop(); + server.stop(); + }, + }; + }, + scenarios: crmScenarios, + verify(store, scenario) { + if (!scenario.verify) return undefined; + const inner = (store as unknown as { inner: CrmStore }).inner; + return scenario.verify(inner as unknown as Parameters>[0]); + }, + mcpSystemPrompt: + "You are a CRM agent. You have tools to list and mutate contacts, deals, and activities. " + + "You have no prior knowledge of the data — discover it using list_* and get_* tools. " + + 'When the task is complete, respond with "DONE".', + async startMcpServer(scale: DataScale, _variant: string): Promise { + // All current MCP variants share the flat server; prompt-level variants + // are applied by the cell runner via resolveMcpVariant. + const env: Record = { ...process.env } as Record; + env.BENCH_SCALE = scale; + env.BENCH_SEED = String(42); + const serverPath = new URL("./mcp-server.ts", import.meta.url).pathname; + const transport = new StdioClientTransport({ + command: "bun", + args: ["run", serverPath], + env, + }); + const client = new Client({ name: "slop-benchmarks-v2", version: "0.2.0" }); + await client.connect(transport); + return { + client, + stop: async () => { + await client.close(); + }, + verify: async (scenario: Scenario): Promise => { + if (!scenario.verify) return undefined; + // Reconstruct by listing all three entity collections. 
/**
 * Extracts the JSON array carried by the first text item of an MCP tool result.
 *
 * The list_* tools answer with a single `{ type: "text", text: "<json array>" }`
 * content item. Anything else — a missing/invalid result, no text item,
 * unparsable text, or a JSON value that is not an array (for example the
 * `{ "error": ... }` object the server emits on failure) — yields an empty
 * array, so callers can always treat the return value as a list.
 *
 * @param result Raw value returned by `client.callTool(...)`.
 * @returns The parsed array, or `[]` when no array payload is available.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- element shape is decided by the caller (Contact | Deal | Activity)
function parseJson(result: unknown): any[] {
  const rawContent = (result as { content?: unknown } | null | undefined)?.content;
  // Guard before calling .find(): a malformed result must not throw here.
  const content: Array<{ type: string; text?: string }> = Array.isArray(rawContent) ? rawContent : [];
  const text = content.find((c) => c != null && c.type === "text")?.text ?? "[]";
  try {
    const parsed: unknown = JSON.parse(text);
    // Previously a non-array payload was passed through typed as any[],
    // which made the later `.map()` in CrmStore.reset blow up.
    return Array.isArray(parsed) ? parsed : [];
  } catch {
    return [];
  }
}
42); + +const store = new CrmStore(); +const { contacts, deals, activities } = seedCrm(scale, seed); +store.reset(contacts, deals, activities); + +const server = new Server({ name: "crm-mcp", version: "0.2.0" }, { capabilities: { tools: {} } }); + +server.setRequestHandler(ListToolsRequestSchema, async () => ({ + tools: [ + { name: "list_contacts", description: "List every contact", inputSchema: { type: "object" as const, properties: {} } }, + { name: "list_deals", description: "List every deal. Optional filter by stage.", inputSchema: { type: "object" as const, properties: { stage: { type: "string", description: "lead|qualified|proposal|won|lost" } } } }, + { name: "list_activities", description: "List every activity. Optional filter by dealId or contactId.", inputSchema: { type: "object" as const, properties: { deal_id: { type: "string" }, contact_id: { type: "string" } } } }, + { name: "get_contact", description: "Get a contact by id", inputSchema: { type: "object" as const, properties: { id: { type: "string" } }, required: ["id"] } }, + { name: "get_deal", description: "Get a deal by id", inputSchema: { type: "object" as const, properties: { id: { type: "string" } }, required: ["id"] } }, + { name: "get_activity", description: "Get an activity by id", inputSchema: { type: "object" as const, properties: { id: { type: "string" } }, required: ["id"] } }, + { name: "deals_for_contact", description: "Return every deal belonging to a contact", inputSchema: { type: "object" as const, properties: { contact_id: { type: "string" } }, required: ["contact_id"] } }, + { name: "activities_for_deal", description: "Return every activity attached to a deal", inputSchema: { type: "object" as const, properties: { deal_id: { type: "string" } }, required: ["deal_id"] } }, + { name: "activities_for_contact", description: "Return every activity attached to a contact", inputSchema: { type: "object" as const, properties: { contact_id: { type: "string" } }, required: ["contact_id"] } }, 
/**
 * Dispatches MCP `tools/call` requests against the in-memory CRM store.
 *
 * Every branch answers with a text content item produced by `json(...)`, or
 * by `err(...)` when the input is invalid or the store throws. Exceptions
 * raised by the store (e.g. "deal X not found") are caught at the bottom and
 * converted into an error result, so the handler itself never rejects.
 */
server.setRequestHandler(CallToolRequestSchema, async (req) => {
  const { name, arguments: args } = req.params;
  const a = (args ?? {}) as Record<string, unknown>;
  try {
    switch (name) {
      case "list_contacts":
        return json(store.contacts);
      case "list_deals": {
        const stage = a.stage ? String(a.stage) : undefined;
        // Named `matching` so the module-level `deals` seed array is not shadowed.
        const matching = stage ? store.deals.filter((d) => d.stage === stage) : store.deals;
        return json(matching);
      }
      case "list_activities": {
        let out = store.activities;
        if (a.deal_id) out = out.filter((x) => x.dealId === a.deal_id);
        if (a.contact_id) out = out.filter((x) => x.contactId === a.contact_id);
        return json(out);
      }
      case "get_contact": {
        const contact = store.getContact(String(a.id));
        return contact ? json(contact) : err(`contact ${a.id} not found`);
      }
      case "get_deal": {
        const deal = store.getDeal(String(a.id));
        return deal ? json(deal) : err(`deal ${a.id} not found`);
      }
      case "get_activity": {
        const activity = store.getActivity(String(a.id));
        return activity ? json(activity) : err(`activity ${a.id} not found`);
      }
      case "deals_for_contact":
        return json(store.dealsForContact(String(a.contact_id)));
      case "activities_for_deal":
        return json(store.activitiesForDeal(String(a.deal_id)));
      case "activities_for_contact":
        return json(store.activitiesForContact(String(a.contact_id)));
      case "advance_deal_stage": {
        const stage = String(a.stage);
        if (!["lead", "qualified", "proposal", "won", "lost"].includes(stage)) return err(`invalid stage ${stage}`);
        store.advanceStage(String(a.id), stage as DealStage);
        return json({ id: a.id, stage });
      }
      case "set_deal_value": {
        // Number(undefined) is NaN and Number(null) / Number("") are 0, so a
        // bare Number(a.value) would silently corrupt the deal. Accept only
        // real numbers or non-empty numeric strings.
        const raw = a.value;
        const value =
          typeof raw === "number"
            ? raw
            : typeof raw === "string" && raw.trim() !== ""
              ? Number(raw)
              : Number.NaN;
        if (!Number.isFinite(value)) return err(`invalid value: ${String(raw)} (expected a finite number)`);
        store.setDealValue(String(a.id), value);
        return json({ id: a.id, value });
      }
      case "add_activity": {
        const missing = ["type", "subject", "body"].filter((k) => a[k] == null);
        if (missing.length > 0) return err(`missing required fields: ${missing.join(", ")}`);
        const type = String(a.type);
        if (!["call", "email", "meeting", "note"].includes(type)) return err(`invalid type: ${type} (expected call|email|meeting|note)`);
        const dealId = a.deal_id ? String(a.deal_id) : null;
        const contactId = a.contact_id ? String(a.contact_id) : null;
        // The activity must hang off exactly one parent.
        if (dealId && contactId) return err("provide deal_id OR contact_id, not both");
        if (!dealId && !contactId) return err("provide deal_id OR contact_id");
        const activity = store.addActivity({
          dealId,
          contactId,
          type: type as ActivityType,
          subject: String(a.subject),
          body: String(a.body),
        });
        return json(activity);
      }
      case "delete_contact":
        store.deleteContact(String(a.id));
        return json({ deleted: a.id });
      case "delete_deal":
        store.deleteDeal(String(a.id));
        return json({ deleted: a.id });
      case "delete_activity":
        store.deleteActivity(String(a.id));
        return json({ deleted: a.id });
      default:
        return err(`unknown tool ${name}`);
    }
  } catch (e) {
    return err(e instanceof Error ? e.message : String(e));
  }
});
/**
 * Verifier for the `qualify-leads` scenario.
 *
 * Only the final state is visible, so the verifier combines three signals:
 *  1. No deal may remain in stage "lead" while having a 'call' activity.
 *  2. The two deals the seed pins as lead-with-call (`deal-1`, `deal-2`) must
 *     exist and be in stage "qualified". Without this, moving them to "lost"
 *     or deleting them would satisfy (1), and (3) could be satisfied by a
 *     deal the seed had already placed in "qualified".
 *  3. At least one deal is in stage "qualified".
 *
 * @param store Final CRM state after the agent ran.
 * @returns Aggregate pass flag plus the individual checks.
 */
function verifyQualifyLeads(store: CrmStore): VerificationResult {
  const checks: VerificationResult["checks"] = [];

  // (1) Any lead that still carries a call activity is a miss.
  for (const deal of store.deals) {
    if (deal.stage !== "lead") continue;
    const hasCall = store.activitiesForDeal(deal.id).some((a) => a.type === "call");
    if (hasCall) {
      checks.push({
        name: `${deal.id} lead-with-call advanced to qualified`,
        passed: false,
        detail: `deal still in lead with call activity`,
      });
    }
  }

  // (2) The seed-pinned targets must have ended up in "qualified".
  for (const pinnedId of ["deal-1", "deal-2"]) {
    const pinned = store.deals.find((d) => d.id === pinnedId);
    const ok = pinned !== undefined && pinned.stage === "qualified";
    checks.push({
      name: `${pinnedId} (seeded lead with call) is qualified`,
      passed: ok,
      detail: ok
        ? undefined
        : pinned === undefined
          ? `deal was deleted`
          : `deal is in stage ${pinned.stage}`,
    });
  }

  // (3) Coarse positive signal kept from the original verifier.
  const anyQualified = store.deals.some((d) => d.stage === "qualified");
  checks.push({
    name: "at least one deal advanced to qualified",
    passed: anyQualified,
  });

  return { passed: checks.every((c) => c.passed), checks };
}
/**
 * Verifier for the `contact-cleanup` scenario.
 *
 * Without a pre-run snapshot the verifier cannot list which contacts were
 * deleted, so it checks the invariants visible in the final state:
 *  - both seeded orphans (`orphan-1`, `orphan-2`) are gone;
 *  - every surviving deal still points at an existing contact;
 *  - every surviving contact-linked activity still points at an existing
 *    contact (the prompt forbids deleting contacts that have activities);
 *  - at least one non-orphan contact survives.
 *
 * The aggregate `passed` flag is the conjunction of ALL listed checks, so a
 * run can no longer report success while one of its own checks failed.
 *
 * @param store Final CRM state after the agent ran.
 * @returns Aggregate pass flag plus the individual checks.
 */
function verifyContactCleanup(store: CrmStore): VerificationResult {
  const orphansGone =
    store.getContact("orphan-1") === undefined && store.getContact("orphan-2") === undefined;

  // Seeded main contacts all own at least one deal, so none should vanish.
  const remainingMain = store.contacts.filter((c) => !c.id.startsWith("orphan")).length;

  // A deal whose contact no longer resolves means a non-orphan was deleted.
  const wronglyDeleted: string[] = [];
  for (const d of store.deals) {
    if (!store.getContact(d.contactId)) wronglyDeleted.push(d.contactId);
  }

  // Same reasoning for activities attached directly to a contact.
  const danglingActivities: string[] = [];
  for (const a of store.activities) {
    if (a.contactId != null && !store.getContact(a.contactId)) danglingActivities.push(a.id);
  }

  const checks: VerificationResult["checks"] = [
    {
      name: "both orphan contacts deleted",
      passed: orphansGone,
      detail: orphansGone ? undefined : "at least one orphan contact still present",
    },
    {
      name: "no contact deleted that still has deals",
      passed: wronglyDeleted.length === 0,
      detail: wronglyDeleted.length === 0 ? undefined : `${wronglyDeleted.length} deals orphaned by contact deletion`,
    },
    {
      name: "no contact deleted that still has activities",
      passed: danglingActivities.length === 0,
      detail:
        danglingActivities.length === 0
          ? undefined
          : `${danglingActivities.length} activities orphaned by contact deletion`,
    },
    {
      name: "non-orphan contacts preserved",
      passed: remainingMain > 0,
    },
  ];
  return { passed: checks.every((c) => c.passed), checks };
}
/**
 * Creates a deterministic pseudo-random generator (32-bit xorshift with the
 * 13 / 17 / 5 shift triple) returning values in [0, 1) with six decimal
 * digits of resolution.
 *
 * The seed is coerced to a 32-bit integer BEFORE the zero check. xorshift
 * never leaves state 0, and JavaScript's bitwise operators truncate their
 * operands to int32, so a seed such as 0.5 or 2**32 — truthy as a number but
 * 0 as an int32 — used to produce a generator that returned 0 forever. Such
 * seeds now fall back to the same default state as seed 0. Every seed that
 * already worked yields exactly the same sequence as before.
 *
 * @param seed Any number; NaN, 0 and values that truncate to 0 use the default state.
 * @returns A function producing the next pseudo-random value on each call.
 */
function makeRng(seed: number) {
  let x = (seed | 0) || 0x2abcdef;
  return () => {
    x ^= x << 13;
    x ^= x >>> 17;
    x ^= x << 5;
    // >>> 0 reinterprets the int32 state as unsigned before reducing it.
    return ((x >>> 0) % 1_000_000) / 1_000_000;
  };
}
/**
 * Builds the deterministic CRM fixture for a given scale and seed.
 *
 * What the output guarantees:
 *  - `SIZES[scale].contacts` main contacts (`contact-N`) followed by two
 *    orphan contacts (`orphan-1`, `orphan-2`) that never receive deals or
 *    activities — the targets of the `contact-cleanup` scenario;
 *  - deals are assigned to main contacts round-robin, so every main contact
 *    owns a deal whenever there are at least as many deals as contacts;
 *  - `deal-1` and `deal-2` are forced to stage "lead" and each gets a 'call'
 *    activity (`act-seed-call-1/2`) — the targets of `qualify-leads`;
 *  - at least one deal has `valueUsd > 50000`, so `high-value-alert` always
 *    has a target. Roughly one deal in four lands there by chance; if none
 *    did, the last deal is raised to a fixed $75,000. This fallback draws no
 *    random numbers, so seeds that already produced a high-value deal
 *    generate exactly the same data as before.
 *
 * Deal stages other than the two pinned leads are drawn at random; no
 * particular stage distribution is guaranteed.
 *
 * @param scale Data-size bucket; selects the row counts from `SIZES`.
 * @param seed  RNG seed; the same (scale, seed) pair always yields the same data.
 */
export function seedCrm(scale: DataScale, seed: number): {
  contacts: Contact[];
  deals: Deal[];
  activities: Activity[];
} {
  const rng = makeRng(seed);
  const sizes = SIZES[scale];
  const contacts: Contact[] = [];
  const deals: Deal[] = [];
  const activities: Activity[] = [];

  for (let i = 0; i < sizes.contacts; i++) {
    contacts.push({
      id: `contact-${i + 1}`,
      name: `${NAMES[i % NAMES.length]} #${i + 1}`,
      company: COMPANIES[Math.floor(rng() * COMPANIES.length)],
      email: `person${i + 1}@example.com`,
      role: ROLES[Math.floor(rng() * ROLES.length)],
    });
  }
  // Inject two orphan contacts at the end with no deals and no activities.
  // Deleting these is the job of the `contact-cleanup` scenario.
  contacts.push(
    { id: "orphan-1", name: "Orphan One", company: "No Company", email: "o1@example.com", role: "N/A" },
    { id: "orphan-2", name: "Orphan Two", company: "No Company", email: "o2@example.com", role: "N/A" },
  );

  const mainContacts = contacts.filter((c) => !c.id.startsWith("orphan"));
  for (let i = 0; i < sizes.deals; i++) {
    const contact = mainContacts[i % mainContacts.length];
    const stage = STAGES[Math.floor(rng() * STAGES.length)];
    // One in four deals gets pushed above $50k so high-value-alert has targets.
    // NOTE: the order of rng() calls is part of the fixture — do not reorder.
    const baseValue = 5000 + Math.floor(rng() * 30000);
    const value = rng() < 0.25 ? 50000 + Math.floor(rng() * 80000) : baseValue;
    deals.push({
      id: `deal-${i + 1}`,
      contactId: contact.id,
      title: `Contract ${i + 1} — ${contact.company}`,
      valueUsd: value,
      stage,
    });
  }

  // Force at least two lead-stage deals with a call activity — qualify-leads targets.
  if (deals.length >= 2) {
    deals[0].stage = "lead";
    deals[1].stage = "lead";
  }

  // Make the "at least one deal above $50k" guarantee real: the random draw
  // alone misses it for some seeds at small scales, which would let the
  // high-value-alert verifier pass without the agent doing anything.
  const lastDeal = deals[deals.length - 1];
  if (lastDeal && !deals.some((d) => d.valueUsd > 50000)) {
    lastDeal.valueUsd = 75000;
  }

  for (let i = 0; i < sizes.activities; i++) {
    // ~70% link to a deal, the rest to a contact
    const toDeal = rng() < 0.7;
    const deal = toDeal ? deals[Math.floor(rng() * deals.length)] : null;
    const contact = deal ? null : mainContacts[Math.floor(rng() * mainContacts.length)];
    activities.push({
      id: `act-${i + 1}`,
      dealId: deal?.id ?? null,
      contactId: contact?.id ?? null,
      type: ACTIVITY_TYPES[Math.floor(rng() * ACTIVITY_TYPES.length)],
      subject: `Touchpoint ${i + 1}`,
      body: "Follow-up notes.",
    });
  }
  // Force call activities on the two pinned lead deals
  if (deals.length >= 2) {
    activities.push(
      { id: "act-seed-call-1", dealId: deals[0].id, contactId: null, type: "call", subject: "Intro call", body: "Initial conversation" },
      { id: "act-seed-call-2", dealId: deals[1].id, contactId: null, type: "call", subject: "Discovery call", body: "Scoping the opportunity" },
    );
  }

  return { contacts, deals, activities };
}
(open pipeline first). + */ + optimized?: boolean; +} + +export function createCrmSlopServer(store: CrmStore, opts?: CrmSlopOpts) { + const slop = new SlopServer({ + id: "crm", + name: "CRM", + ...(opts?.maxNodes != null && { maxNodes: opts.maxNodes }), + ...(opts?.maxDepth != null && { maxDepth: opts.maxDepth }), + }); + + const optimized = opts?.optimized ?? false; + + slop.register("overview", () => { + const stageCounts = countByStage(store.deals); + const totalValue = store.deals.reduce((s, d) => s + d.valueUsd, 0); + const highValueCount = store.deals.filter((d) => d.valueUsd > 50000).length; + return { + type: "context", + props: { + contacts: store.contacts.length, + deals: store.deals.length, + activities: store.activities.length, + pipeline_value_usd: totalValue, + lead: stageCounts.lead, + qualified: stageCounts.qualified, + proposal: stageCounts.proposal, + won: stageCounts.won, + lost: stageCounts.lost, + }, + summary: + `${store.contacts.length} contacts, ${store.deals.length} deals ` + + `(${stageCounts.lead}L/${stageCounts.qualified}Q/${stageCounts.proposal}P/${stageCounts.won}W/${stageCounts.lost}⊘), ` + + `${store.activities.length} activities. ${highValueCount} deals >$50k.`, + }; + }); + + slop.register("contacts", () => { + return { + type: "collection", + props: { count: store.contacts.length }, + summary: optimized ? 
/**
 * Tallies deals per pipeline stage.
 *
 * Every stage is present in the result (zero when no deal is in it), so
 * callers can read `counts.lead`, `counts.won`, … without undefined checks.
 *
 * @param deals Deals to count; an empty array yields all zeros.
 * @returns Number of deals in each stage.
 */
function countByStage(deals: Deal[]): Record<DealStage, number> {
  const counts: Record<DealStage, number> = { lead: 0, qualified: 0, proposal: 0, won: 0, lost: 0 };
  for (const d of deals) counts[d.stage] += 1;
  return counts;
}
/**
 * Scores how relevant a deal is for the optimized (salience-ordered) view.
 *
 * The score is a per-stage base (open pipeline stages rank above closed
 * ones) plus a value boost of `valueUsd / 500_000`, capped at 0.3. The total
 * is capped at 1.
 *
 * The boost is clamped to [0, 0.3] and a non-finite `valueUsd` counts as 0:
 * `setDealValue` accepts any number, and a negative or NaN value would
 * otherwise give a negative or NaN score, which makes the comparator used to
 * sort deals by salience unreliable.
 *
 * @param d Deal to score.
 * @returns Salience in the range [0, 1].
 */
function dealSalience(d: Deal): number {
  const stageScore: Record<DealStage, number> = { lead: 0.5, qualified: 0.7, proposal: 0.8, won: 0.2, lost: 0.1 };
  const rawBoost = Number.isFinite(d.valueUsd) ? d.valueUsd / 500_000 : 0;
  const valueBoost = Math.min(0.3, Math.max(0, rawBoost));
  return Math.min(1, stageScore[d.stage] + valueBoost);
}
label: "Edit value", + description: "Set the deal's USD value", + params: { value: { type: "number", description: "New value in USD" } }, + handler: async (p) => { + store.setDealValue(d.id, Number(p.value)); + slop.refresh(); + return { id: d.id }; + }, + }, + add_activity: { + label: "Log activity", + description: "Attach a new activity to this deal", + params: { + type: { type: "string", description: "call | email | meeting | note" }, + subject: { type: "string", description: "Activity subject" }, + body: { type: "string", description: "Activity body" }, + }, + handler: async (p) => { + const a = store.addActivity({ + contactId: null, + dealId: d.id, + type: String(p.type) as ActivityType, + subject: String(p.subject), + body: String(p.body), + }); + slop.refresh(); + return { id: a.id }; + }, + }, + delete: { + label: "Delete deal", + description: "Delete this deal", + params: {}, + handler: async () => { + store.deleteDeal(d.id); + slop.refresh(); + return { deleted: d.id }; + }, + }, + }; + + // State-dependent stage transitions + const stageTargets: Record = { + lead: ["qualified", "lost"], + qualified: ["proposal", "lost"], + proposal: ["won", "lost"], + won: [], + lost: [], + }; + for (const target of stageTargets[d.stage]) { + const actionName = `mark_${target}`; + actions[actionName] = { + label: `Mark ${target}`, + description: `Advance this deal to stage "${target}"`, + params: {}, + handler: async () => { + store.advanceStage(d.id, target); + slop.refresh(); + return { id: d.id, stage: target }; + }, + }; + } + + const node: NodeDescriptor = { + type: "crm:deal", + props: { + contact_id: d.contactId, + title: d.title, + value_usd: d.valueUsd, + stage: d.stage, + activity_count: store.activitiesForDeal(d.id).length, + }, + actions, + }; + if (salience !== undefined) node.meta = { salience }; + return node; +} + +function buildActivityNode(store: CrmStore, slop: SlopServer, a: Activity): NodeDescriptor { + return { + type: "crm:activity", + props: { + 
export type DealStage = "lead" | "qualified" | "proposal" | "won" | "lost";
export type ActivityType = "call" | "email" | "meeting" | "note";

export interface Contact {
  id: string;
  name: string;
  company: string;
  email: string;
  role: string;
}

export interface Deal {
  id: string;
  contactId: string;
  title: string;
  valueUsd: number;
  stage: DealStage;
}

export interface Activity {
  id: string;
  contactId: string | null;
  dealId: string | null;
  type: ActivityType;
  subject: string;
  body: string;
}

/**
 * Pick the next free `act-N` id. Starts at `length + 1` (the id the store has
 * always handed out) and skips forward past ids that are still in use, so an
 * id is never reused after a deletion shrinks the list.
 */
function nextActivityId(existing: readonly Activity[]): string {
  const taken = new Set(existing.map((a) => a.id));
  let n = existing.length + 1;
  while (taken.has(`act-${n}`)) n += 1;
  return `act-${n}`;
}

/**
 * In-memory CRM state: contacts, deals and activities held in plain arrays.
 * Lookups are linear scans; deletes do not cascade to related records.
 */
export class CrmStore {
  contacts: Contact[] = [];
  deals: Deal[] = [];
  activities: Activity[] = [];

  /** Replace all state with shallow copies of the given records. */
  reset(contacts: Contact[], deals: Deal[], activities: Activity[]) {
    this.contacts = contacts.map((c) => ({ ...c }));
    this.deals = deals.map((d) => ({ ...d }));
    this.activities = activities.map((a) => ({ ...a }));
  }

  getContact(id: string): Contact | undefined {
    return this.contacts.find((c) => c.id === id);
  }
  getDeal(id: string): Deal | undefined {
    return this.deals.find((d) => d.id === id);
  }
  getActivity(id: string): Activity | undefined {
    return this.activities.find((a) => a.id === id);
  }

  /**
   * Set a deal's stage. No transition rules are enforced here.
   * @throws Error when the deal does not exist.
   */
  advanceStage(dealId: string, stage: DealStage): Deal {
    const d = this.getDeal(dealId);
    if (!d) throw new Error(`deal ${dealId} not found`);
    d.stage = stage;
    return d;
  }

  /**
   * Overwrite a deal's USD value.
   * @throws Error when the deal does not exist.
   */
  setDealValue(dealId: string, valueUsd: number): Deal {
    const d = this.getDeal(dealId);
    if (!d) throw new Error(`deal ${dealId} not found`);
    d.valueUsd = valueUsd;
    return d;
  }

  /**
   * Append an activity. When no id is supplied a fresh `act-N` id is
   * generated that does not collide with any activity currently stored.
   */
  addActivity(a: Omit<Activity, "id"> & { id?: string }): Activity {
    const id = a.id ?? nextActivityId(this.activities);
    // Spread first so an explicit `id: undefined` on the input cannot
    // overwrite the id chosen above.
    const activity: Activity = { ...a, id };
    this.activities.push(activity);
    return activity;
  }

  deleteContact(id: string): void {
    this.contacts = this.contacts.filter((c) => c.id !== id);
  }

  deleteDeal(id: string): void {
    this.deals = this.deals.filter((d) => d.id !== id);
  }

  deleteActivity(id: string): void {
    this.activities = this.activities.filter((a) => a.id !== id);
  }

  dealsForContact(contactId: string): Deal[] {
    return this.deals.filter((d) => d.contactId === contactId);
  }

  activitiesForContact(contactId: string): Activity[] {
    return this.activities.filter((a) => a.contactId === contactId);
  }

  activitiesForDeal(dealId: string): Activity[] {
    return this.activities.filter((a) => a.dealId === dealId);
  }
}
/** Wrap the concrete store in the opaque AppStore brand used by the registry. */
function wrap(inner: FileBrowserStore): AppStore & { inner: FileBrowserStore } {
  return { __brand: "app-store", inner } as AppStore & { inner: FileBrowserStore };
}

/**
 * Narrow a parsed tool payload to an array.
 *
 * @throws Error when the tool returned something else (e.g. an `{ error }`
 * object), so verification fails with a readable message instead of a
 * "map is not a function" TypeError.
 */
function expectArray<T>(value: unknown, tool: string): T[] {
  if (!Array.isArray(value)) throw new Error(`file-browser mcp: ${tool} did not return a JSON array`);
  return value as T[];
}

/** Registry binding for the file-browser benchmark app (SLOP and MCP launchers). */
export const fileBrowserApp: AppBinding = {
  id: "file-browser",
  supportedScales: ["s", "m", "l", "xl"],
  createStore(scale, seed) {
    const store = new FileBrowserStore();
    const { dirs, files } = seedFileBrowser(scale, seed);
    store.reset(dirs, files);
    return wrap(store);
  },
  async startSlopServer(store, port, opts): Promise<SlopServerHandle> {
    const inner = (store as unknown as { inner: FileBrowserStore }).inner;
    const { server, slop } = startFileBrowserSlopServer(inner, port, opts as FileBrowserSlopOpts | undefined);
    return {
      wsUrl: `ws://localhost:${port}/slop`,
      stop: async () => {
        slop.stop();
        server.stop();
      },
    };
  },
  scenarios: fileBrowserScenarios,
  verify(store, scenario) {
    if (!scenario.verify) return undefined;
    const inner = (store as unknown as { inner: FileBrowserStore }).inner;
    return scenario.verify(inner as unknown as Parameters<NonNullable<Scenario["verify"]>>[0]);
  },
  mcpSystemPrompt:
    "You are a file browser agent. You have tools to navigate a directory tree, read files, and mutate the tree. " +
    "Start by calling list_dir on '/' to see the root. " +
    'When the task is complete, respond with "DONE".',
  async startMcpServer(scale: DataScale, _variant: string): Promise<McpServerHandle> {
    // All current MCP variants share the flat server; prompt-level variants
    // are applied by the cell runner via resolveMcpVariant.
    const env: Record<string, string> = { ...process.env } as Record<string, string>;
    env.BENCH_SCALE = scale;
    // NOTE(review): the MCP path always seeds with 42, while createStore takes
    // the seed from its caller — confirm the runner passes 42 there too.
    env.BENCH_SEED = String(42);
    // URL.pathname is percent-encoded; decode it so a checkout path containing
    // spaces or non-ASCII characters still resolves when handed to `bun run`.
    const serverPath = decodeURIComponent(new URL("./mcp-server.ts", import.meta.url).pathname);
    const transport = new StdioClientTransport({
      command: "bun",
      args: ["run", serverPath],
      env,
    });
    const client = new Client({ name: "slop-benchmarks-v2", version: "0.2.0" });
    await client.connect(transport);
    return {
      client,
      stop: async () => {
        await client.close();
      },
      verify: async (scenario: Scenario): Promise<VerificationResult | undefined> => {
        if (!scenario.verify) return undefined;
        // Rebuild a FileBrowserStore by listing every dir and file.
        const [dRes, fRes] = await Promise.all([
          client.callTool({ name: "list_all_dirs", arguments: {} }),
          client.callTool({ name: "list_all_files", arguments: {} }),
        ]);
        const dirs = expectArray<{ path: string; child_dirs: number; child_files: number }>(
          parseJson(dRes),
          "list_all_dirs",
        );
        const files = expectArray<{ path: string; name: string; size_bytes: number }>(
          parseJson(fRes),
          "list_all_files",
        );
        // list_all_dirs only reports child counts; verifying "is empty" needs
        // the actual child paths, so list each directory once more.
        const tempStore = new FileBrowserStore();
        const fullDirs = await Promise.all(
          dirs.map(async (d) => {
            const listRes = await client.callTool({ name: "list_dir", arguments: { path: d.path } });
            const listed = parseJson(listRes) as { dirs?: Array<{ path: string }>; files?: Array<{ path: string }> };
            return {
              path: d.path,
              name: d.path === "/" ? "" : d.path.slice(d.path.lastIndexOf("/") + 1),
              dirs: (listed.dirs ?? []).map((x) => x.path),
              files: (listed.files ?? []).map((x) => x.path),
            };
          }),
        );
        tempStore.reset(
          fullDirs,
          // File contents are not fetched; verifiers only see paths and sizes.
          files.map((f) => ({ path: f.path, name: f.name, sizeBytes: f.size_bytes, content: "" })),
        );
        return scenario.verify(tempStore as unknown as Parameters<NonNullable<Scenario["verify"]>>[0]);
      },
    };
  },
};

/**
 * Extract and parse the first text part of an MCP tool result.
 * Best-effort: a missing text part or unparsable JSON yields `[]`.
 */
function parseJson(result: unknown): unknown {
  const content = (result as { content?: Array<{ type: string; text?: string }> }).content ?? [];
  const text = content.find((c) => c.type === "text")?.text ?? "[]";
  try {
    return JSON.parse(text);
  } catch {
    return [];
  }
}
// Tool dispatch. Store methods throw on bad paths; the catch turns any such
// error into an MCP error result instead of failing the request.
server.setRequestHandler(CallToolRequestSchema, async (req) => {
  const { name, arguments: args } = req.params;
  const a = (args ?? {}) as Record<string, unknown>;
  try {
    switch (name) {
      case "list_dir": {
        const d = store.getDir(String(a.path));
        if (!d) return err(`dir ${a.path} not found`);
        return json({
          path: d.path,
          dirs: d.dirs.map((p) => ({ path: p, name: store.getDir(p)?.name, is_empty: store.isDirEmpty(p) })),
          files: d.files.map((p) => ({ path: p, name: store.getFile(p)?.name, size_bytes: store.getFile(p)?.sizeBytes })),
        });
      }
      case "list_all_dirs":
        return json(
          store.listDirs().map((d) => ({
            path: d.path,
            is_empty: store.isDirEmpty(d.path),
            child_dirs: d.dirs.length,
            child_files: d.files.length,
          })),
        );
      case "list_all_files":
        return json(store.listFiles().map((f) => ({ path: f.path, name: f.name, size_bytes: f.sizeBytes })));
      case "read_file": {
        const f = store.getFile(String(a.path));
        if (!f) return err(`file ${a.path} not found`);
        return json({ path: f.path, content: f.content });
      }
      case "delete_file":
        store.deleteFile(String(a.path));
        return json({ deleted: a.path });
      case "delete_dir":
        store.deleteDir(String(a.path));
        return json({ deleted: a.path });
      case "create_dir": {
        // Report the path the store actually created: naive `${parent}/${name}`
        // yields "//name" when the parent is the root.
        const created = store.createDir(String(a.parent), String(a.name));
        return json({ created: created.path });
      }
      case "rename_file":
        store.renameFile(String(a.path), String(a.new_name));
        return json({ renamed: a.path, to: a.new_name });
      case "move_file":
        store.moveFile(String(a.path), String(a.new_parent));
        return json({ moved: a.path, to: a.new_parent });
      default:
        return err(`unknown tool ${name}`);
    }
  } catch (e) {
    return err(e instanceof Error ? e.message : String(e));
  }
});

/** Wrap a value as a successful tool result with a single JSON text part. */
function json(data: unknown) {
  // `as const` keeps the literal "text" type, matching the `"object" as const`
  // convention used in the tool list.
  return { content: [{ type: "text" as const, text: JSON.stringify(data) }] };
}

/** Wrap a message as a tool error result: `{ error: msg }` with `isError` set. */
function err(msg: string) {
  return { content: [{ type: "text" as const, text: JSON.stringify({ error: msg }) }], isError: true };
}

const transport = new StdioServerTransport();
await server.connect(transport);
/**
 * Verifier for the delete-empty-dirs scenario: passes when no directory other
 * than the root is empty. On failure the single check lists the paths that
 * are still empty.
 */
function verifyNoEmptyDirs(store: FileBrowserStore): VerificationResult {
  const leftover: string[] = [];
  for (const dir of store.listDirs()) {
    // The root is never a deletion target, so it is ignored even when empty.
    if (dir.path === "/") continue;
    if (store.isDirEmpty(dir.path)) leftover.push(dir.path);
  }
  const clean = leftover.length === 0;
  return {
    passed: clean,
    checks: [
      {
        name: "no empty directories remain",
        passed: clean,
        detail: clean ? undefined : `still empty: ${leftover.join(", ")}`,
      },
    ],
  };
}
/** Tree shape per scale: recursion depth plus nominal fan-out per directory. */
const SHAPES: Record<DataScale, { depth: number; dirsPerLevel: number; filesPerDir: number }> = {
  s: { depth: 2, dirsPerLevel: 2, filesPerDir: 2 },
  m: { depth: 3, dirsPerLevel: 3, filesPerDir: 2 },
  l: { depth: 4, dirsPerLevel: 3, filesPerDir: 3 },
  xl: { depth: 5, dirsPerLevel: 3, filesPerDir: 4 },
};

const NAMES = ["src", "lib", "tests", "docs", "assets", "build", "dist", "config", "scripts", "examples"];
const FILE_NAMES = ["main.ts", "util.ts", "index.html", "styles.css", "data.json", "notes.md", "debug.log"];

/**
 * Small xorshift generator returning values in [0, 1) with 1e-6 resolution.
 *
 * The seed is truncated to a 32-bit integer up front. The shift/xor steps
 * already operate on that truncation, so integer seeds produce the same
 * sequence as before; but a seed that truncates to 0 (0.5, 2**32, NaN) now
 * falls back to the default seed instead of sticking at 0 forever.
 */
function makeRng(seed: number) {
  let x = (seed | 0) || 0x3cafeba;
  return () => {
    x ^= x << 13;
    x ^= x >>> 17;
    x ^= x << 5;
    return ((x >>> 0) % 1_000_000) / 1_000_000;
  };
}

/**
 * Deterministic file-tree seed: the same (scale, seed) pair always yields the
 * same tree.
 *
 * Guarantees (relied on by scenario verifiers):
 *  - exactly one file named "README.md", at the root
 *  - an empty directory at "/empty"
 *  - at least 3 files with a ".log" extension
 *
 * @throws Error when `scale` has no entry in SHAPES.
 */
export function seedFileBrowser(scale: DataScale, seed: number): { dirs: DirNode[]; files: FileNode[] } {
  const shape = SHAPES[scale];
  if (!shape) throw new Error(`file-browser: unknown scale "${String(scale)}"`);
  const rng = makeRng(seed);
  const dirs = new Map<string, DirNode>();
  const files: FileNode[] = [];

  // Root
  const root: DirNode = { path: "/", name: "", dirs: [], files: [] };
  dirs.set("/", root);

  // README.md at the root — guarantees a discoverable file for find-and-read.
  const readme: FileNode = {
    path: "/README.md",
    name: "README.md",
    sizeBytes: 128,
    content: "SLOP benchmarks v2 — file-browser sample tree. Look for README at the root.",
  };
  files.push(readme);
  root.files.push(readme.path);

  buildLevel(root, 0, shape, rng, dirs, files);

  // Guarantee an empty dir at the root: one named "empty".
  const empty: DirNode = { path: "/empty", name: "empty", dirs: [], files: [] };
  dirs.set(empty.path, empty);
  root.dirs.push(empty.path);

  // Guarantee at least 3 .log files under the first generated child dir. If
  // there were none, fall back to the root — never to "/empty", which would
  // break the empty-directory guarantee.
  const firstChildPath = root.dirs.find((p) => p !== empty.path);
  const logHost = (firstChildPath !== undefined ? dirs.get(firstChildPath) : undefined) ?? root;
  for (let i = 0; i < 3; i++) {
    const name = `run-${i + 1}.log`;
    const path = joinPath(logHost.path, name);
    files.push({ path, name, sizeBytes: 512, content: `log output ${i + 1}` });
    logHost.files.push(path);
  }

  return { dirs: Array.from(dirs.values()), files };
}

/**
 * Recursively populate `parent` with directories and files until
 * `shape.depth` is reached. Every directory gets at least one file and every
 * level at least one directory. The order of `rng()` calls determines the
 * generated tree, so it must not be rearranged.
 */
function buildLevel(
  parent: DirNode,
  depth: number,
  shape: { depth: number; dirsPerLevel: number; filesPerDir: number },
  rng: () => number,
  dirs: Map<string, DirNode>,
  files: FileNode[],
) {
  if (depth >= shape.depth) return;
  const dirCount = Math.max(1, Math.round(shape.dirsPerLevel * (0.7 + rng() * 0.6)));
  for (let i = 0; i < dirCount; i++) {
    // The index suffix keeps sibling names unique even when the RNG repeats a base name.
    const name = `${NAMES[Math.floor(rng() * NAMES.length)]}-${i + 1}`;
    const path = joinPath(parent.path, name);
    const dir: DirNode = { path, name, dirs: [], files: [] };
    dirs.set(path, dir);
    parent.dirs.push(path);
    // Add files to this directory
    const fileCount = Math.max(1, Math.round(shape.filesPerDir * (0.6 + rng() * 0.8)));
    for (let j = 0; j < fileCount; j++) {
      // "main.ts" -> "main-1.ts": the index goes before the extension.
      const fileName = FILE_NAMES[Math.floor(rng() * FILE_NAMES.length)].replace(/\.(\w+)$/, `-${j + 1}.$1`);
      const fpath = joinPath(path, fileName);
      files.push({ path: fpath, name: fileName, sizeBytes: 256 + Math.floor(rng() * 2048), content: `// file ${fpath}` });
      dir.files.push(fpath);
    }
    buildLevel(dir, depth + 1, shape, rng, dirs, files);
  }
}
/**
 * Map a store path to its SLOP register key by prefixing it with "tree".
 * The root ("/") becomes "tree"; "/src/lib" becomes "tree/src/lib", so there
 * is always a canonical entry point above the first real directory.
 */
function slopPath(storePath: string): string {
  if (storePath === "/") return "tree";
  return `tree${storePath}`;
}

/**
 * Build a SlopServer exposing the file tree: an "overview" context node plus
 * one registered node per directory that exists when this function runs.
 *
 * With `opts.optimized`, directories at depth >= 2 are registered as lazy
 * nodes (no inline children) and nothing below them is registered.
 */
export function createFileBrowserSlopServer(store: FileBrowserStore, opts?: FileBrowserSlopOpts) {
  const slop = new SlopServer({
    id: "file-browser",
    name: "File Browser",
    ...(opts?.maxNodes != null && { maxNodes: opts.maxNodes }),
    ...(opts?.maxDepth != null && { maxDepth: opts.maxDepth }),
  });

  const optimized = opts?.optimized ?? false;

  slop.register("overview", () => {
    const allDirs = store.listDirs();
    const totalDirs = allDirs.length;
    const totalFiles = store.listFiles().length;
    const emptyDirs = allDirs.filter((d) => d.path !== "/" && store.isDirEmpty(d.path)).length;
    return {
      type: "context",
      props: {
        total_dirs: totalDirs,
        total_files: totalFiles,
        empty_dirs: emptyDirs,
      },
      summary: `${totalDirs} directories (${emptyDirs} empty), ${totalFiles} files`,
    };
  });

  const registerDir = (dir: DirNode, depth: number) => {
    const isDeep = optimized && depth >= 2;

    slop.register(slopPath(dir.path), () => {
      // Re-read on every evaluation: the directory may have been mutated or
      // deleted since registration.
      const current = store.getDir(dir.path);
      if (!current) return { type: "missing" } satisfies NodeDescriptor;

      const children: Record<string, NodeDescriptor> = {};
      if (!isDeep) {
        for (const childDirPath of current.dirs) {
          const child = store.getDir(childDirPath);
          if (child) children[child.name] = buildDirStub(child, store);
        }
        for (const filePath of current.files) {
          const file = store.getFile(filePath);
          if (file) children[file.name] = buildFileNode(store, slop, file);
        }
      }

      const actions: NonNullable<NodeDescriptor["actions"]> = {
        create_file: {
          label: "Create file",
          description: "Create a new file inside this directory",
          params: {
            name: { type: "string", description: "New file name" },
            content: { type: "string", description: "File contents" },
          },
          // No first-class createFile on the store — not needed by current
          // scenarios. Nothing changes, so no refresh is issued.
          handler: async () => ({ error: "create_file not supported" }),
        },
        create_subdir: {
          label: "Create subdirectory",
          description: "Create a new empty directory inside this one",
          params: { name: { type: "string", description: "New directory name" } },
          handler: async (p) => {
            // NOTE(review): the new directory shows up as a dir-stub child, but
            // no node is registered for it (registration only happens at
            // startup), so it gets no affordances of its own. Confirm whether
            // SlopServer.register may be called after startup before fixing.
            store.createDir(current.path, String(p.name));
            slop.refresh();
            return { id: current.path };
          },
        },
      };
      // State-dependent: delete is only available when the dir is empty and non-root.
      if (current.path !== "/" && store.isDirEmpty(current.path)) {
        actions.delete = {
          label: "Delete empty directory",
          description: "Delete this directory (only available when empty)",
          params: {},
          handler: async () => {
            store.deleteDir(current.path);
            slop.refresh();
            return { deleted: current.path };
          },
        };
      }

      const node: NodeDescriptor = {
        type: "dir",
        props: {
          path: current.path,
          child_dirs: current.dirs.length,
          child_files: current.files.length,
          is_empty: store.isDirEmpty(current.path),
        },
        summary: isDeep
          ? `${current.dirs.length} subdirs, ${current.files.length} files (lazy — use slop_query to load)`
          : undefined,
        actions,
      };
      if (!isDeep) {
        node.children = children;
      }
      return node;
    });

    // Lazy nodes are leaves of the registration walk.
    if (isDeep) return;
    for (const childPath of dir.dirs) {
      const child = store.getDir(childPath);
      if (child) registerDir(child, depth + 1);
    }
  };

  const root = store.getDir("/");
  if (root) registerDir(root, 0);

  return slop;
}

/** Compact inline placeholder for a child directory: path, name, emptiness and child counts. */
function buildDirStub(dir: DirNode, store: FileBrowserStore): NodeDescriptor {
  return {
    type: "dir-stub",
    props: {
      path: dir.path,
      name: dir.name,
      is_empty: store.isDirEmpty(dir.path),
    },
    summary: `${dir.dirs.length} subdirs, ${dir.files.length} files`,
  };
}
"" }; + }, + }, + delete_file: { + label: "Delete file", + description: "Delete this file", + params: {}, + handler: async () => { + store.deleteFile(file.path); + slop.refresh(); + return { deleted: file.path }; + }, + }, + }, + }; +} + +export function startFileBrowserSlopServer(store: FileBrowserStore, port: number, opts?: FileBrowserSlopOpts) { + const slop = createFileBrowserSlopServer(store, opts); + const handler = bunHandler(slop, { path: "/slop" }); + const server = Bun.serve({ + port, + fetch(req, srv) { + const resp = handler.fetch(req, srv); + if (resp) return resp; + return new Response("SLOP File Browser benchmark server", { status: 200 }); + }, + websocket: handler.websocket, + }); + return { server, slop }; +} diff --git a/benchmarks/v2/apps/file-browser/store.ts b/benchmarks/v2/apps/file-browser/store.ts new file mode 100644 index 0000000..ad1cd52 --- /dev/null +++ b/benchmarks/v2/apps/file-browser/store.ts @@ -0,0 +1,122 @@ +export interface FileNode { + path: string; + name: string; + sizeBytes: number; + content: string; +} + +export interface DirNode { + path: string; + name: string; + dirs: string[]; // child dir paths + files: string[]; // child file paths +} + +/** + * Deep tree store. Paths are unix-style absolute strings rooted at "/". + * All operations are path-indexed so the SLOP server can register nodes at + * arbitrary depths without tracking parent pointers separately. 
/**
 * Reject names that would corrupt the path index: joinPath() concatenates the
 * name verbatim, so an empty name or one containing "/" yields a path whose
 * parentDir() is not the directory it was inserted into.
 */
function assertValidName(name: string): void {
  if (name === "" || name === "." || name === ".." || name.includes("/")) {
    throw new Error(`invalid name "${name}"`);
  }
}

/**
 * Deep tree store. Paths are unix-style absolute strings rooted at "/".
 * All operations are path-indexed so the SLOP server can register nodes at
 * arbitrary depths without tracking parent pointers separately.
 */
export class FileBrowserStore {
  dirs = new Map<string, DirNode>();
  files = new Map<string, FileNode>();

  /** Replace all state with copies of the given nodes (child lists are cloned too). */
  reset(dirs: DirNode[], files: FileNode[]) {
    this.dirs.clear();
    this.files.clear();
    for (const d of dirs) this.dirs.set(d.path, { ...d, dirs: [...d.dirs], files: [...d.files] });
    for (const f of files) this.files.set(f.path, { ...f });
  }

  getDir(path: string): DirNode | undefined {
    return this.dirs.get(path);
  }

  getFile(path: string): FileNode | undefined {
    return this.files.get(path);
  }

  listDirs(): DirNode[] {
    return Array.from(this.dirs.values());
  }

  listFiles(): FileNode[] {
    return Array.from(this.files.values());
  }

  /** True when the directory exists and has no children; false for unknown paths. */
  isDirEmpty(path: string): boolean {
    const d = this.dirs.get(path);
    if (!d) return false;
    return d.dirs.length === 0 && d.files.length === 0;
  }

  /** @throws Error when the file does not exist. */
  deleteFile(path: string): void {
    const f = this.files.get(path);
    if (!f) throw new Error(`file ${path} not found`);
    this.files.delete(path);
    const parent = this.dirs.get(parentDir(path));
    if (parent) parent.files = parent.files.filter((p) => p !== path);
  }

  /** @throws Error when the directory does not exist or is not empty. */
  deleteDir(path: string): void {
    const d = this.dirs.get(path);
    if (!d) throw new Error(`dir ${path} not found`);
    if (!this.isDirEmpty(path)) throw new Error(`dir ${path} is not empty`);
    this.dirs.delete(path);
    const parent = this.dirs.get(parentDir(path));
    if (parent) parent.dirs = parent.dirs.filter((p) => p !== path);
  }

  /**
   * Rename a file within its directory.
   * @throws Error when the file is missing, the name is invalid, or the target path is taken.
   */
  renameFile(path: string, newName: string): FileNode {
    const f = this.files.get(path);
    if (!f) throw new Error(`file ${path} not found`);
    assertValidName(newName);
    const parent = parentDir(path);
    const newPath = joinPath(parent, newName);
    if (this.files.has(newPath) || this.dirs.has(newPath)) throw new Error(`path ${newPath} already exists`);
    this.files.delete(path);
    const updated: FileNode = { ...f, path: newPath, name: newName };
    this.files.set(newPath, updated);
    const parentNode = this.dirs.get(parent);
    // Replace in place so the file keeps its position in the listing.
    if (parentNode) parentNode.files = parentNode.files.map((p) => (p === path ? newPath : p));
    return updated;
  }

  /**
   * Move a file into another directory, keeping its name.
   * @throws Error when the file or target directory is missing, or the target path is taken.
   */
  moveFile(path: string, newParentPath: string): FileNode {
    const f = this.files.get(path);
    if (!f) throw new Error(`file ${path} not found`);
    const newParent = this.dirs.get(newParentPath);
    if (!newParent) throw new Error(`dir ${newParentPath} not found`);
    const oldParent = this.dirs.get(parentDir(path));
    const newPath = joinPath(newParentPath, f.name);
    if (this.files.has(newPath) || this.dirs.has(newPath)) throw new Error(`path ${newPath} already exists`);
    this.files.delete(path);
    const updated: FileNode = { ...f, path: newPath };
    this.files.set(newPath, updated);
    if (oldParent) oldParent.files = oldParent.files.filter((p) => p !== path);
    newParent.files.push(newPath);
    return updated;
  }

  /**
   * Create an empty directory under `parentPath`.
   * @throws Error when the parent is missing, the name is invalid, or the target path is taken.
   */
  createDir(parentPath: string, name: string): DirNode {
    const parent = this.dirs.get(parentPath);
    if (!parent) throw new Error(`dir ${parentPath} not found`);
    assertValidName(name);
    const newPath = joinPath(parentPath, name);
    if (this.dirs.has(newPath) || this.files.has(newPath)) throw new Error(`path ${newPath} already exists`);
    const d: DirNode = { path: newPath, name, dirs: [], files: [] };
    this.dirs.set(newPath, d);
    parent.dirs.push(newPath);
    return d;
  }
}

/**
 * Parent directory of an absolute path. The root is its own parent. A path
 * with no "/" is treated as a direct child of the root rather than being
 * truncated by one character.
 */
export function parentDir(path: string): string {
  if (path === "/") return "/";
  const idx = path.lastIndexOf("/");
  if (idx <= 0) return "/";
  return path.slice(0, idx);
}

/** Join a directory path and a child name without doubling the root slash. */
export function joinPath(parent: string, name: string): string {
  if (parent === "/") return `/${name}`;
  return `${parent}/${name}`;
}
createLargeSeedData } from "../../mcp-vs-slop/app/seed.ts"; +import { startSlopServer, type SlopServerOpts } from "../../mcp-vs-slop/app/slop-server.ts"; +import type { AppBinding, AppStore, McpServerHandle, SlopServerHandle } from "./registry.ts"; +import type { DataScale } from "../runner/types.ts"; +import type { Scenario, VerificationResult } from "../../mcp-vs-slop/scenarios/types.ts"; + +import { exploreAndAct } from "../../mcp-vs-slop/scenarios/explore-and-act.ts"; +import { triage } from "../../mcp-vs-slop/scenarios/triage.ts"; +import { bulkUpdate } from "../../mcp-vs-slop/scenarios/bulk-update.ts"; +import { scaleTriage } from "../../mcp-vs-slop/scenarios/scale-triage.ts"; +import { negative } from "../../mcp-vs-slop/scenarios/negative.ts"; +import { contextual } from "../../mcp-vs-slop/scenarios/contextual.ts"; +import { recovery } from "../../mcp-vs-slop/scenarios/recovery.ts"; +import { stateTransitions } from "../../mcp-vs-slop/scenarios/state-transitions.ts"; +import { crossEntity } from "../../mcp-vs-slop/scenarios/cross-entity.ts"; +import { conditional } from "../../mcp-vs-slop/scenarios/conditional.ts"; +import { ambiguity } from "../../mcp-vs-slop/scenarios/ambiguity.ts"; +import { complexWorkflow } from "../../mcp-vs-slop/scenarios/complex-workflow.ts"; + +// v1 exposes only two seed sizes; we map them to the v2 scale axis. Phase F +// will grow the app's own generators so `m` / `xl` become supported. 
// v1 exposes only two seed sizes; we map them to the v2 scale axis. Phase F
// will grow the app's own generators so `m` / `xl` become supported.
/** @throws Error for scales other than "s" and "l". */
function seedForScale(scale: DataScale) {
  switch (scale) {
    case "s":
      return createSeedData();
    case "l":
      return createLargeSeedData();
    default:
      throw new Error(`issue-tracker: scale "${scale}" not yet supported (supported: s, l)`);
  }
}

/** Wrap the concrete store in the opaque AppStore brand used by the registry. */
function wrap(inner: IssueTrackerStore): AppStore & { inner: IssueTrackerStore } {
  return { __brand: "app-store", inner } as AppStore & { inner: IssueTrackerStore };
}

/**
 * Narrow a parsed tool payload to an array.
 *
 * @throws Error when the tool returned something else (e.g. an error object),
 * so verification fails with a readable message instead of a TypeError while
 * iterating.
 */
function expectJsonArray<T>(value: unknown, tool: string): T[] {
  if (!Array.isArray(value)) throw new Error(`issue-tracker mcp: ${tool} did not return a JSON array`);
  return value as T[];
}

/** Registry binding that adapts the v1 issue-tracker app to the v2 AppBinding contract. */
export const issueTrackerApp: AppBinding = {
  id: "issue-tracker",
  supportedScales: ["s", "l"],
  createStore(scale, _seed) {
    // The v1 seed data is fixed, so the seed argument is not used.
    const store = new IssueTrackerStore();
    store.reset(seedForScale(scale));
    return wrap(store);
  },
  async startSlopServer(store, port, opts: SlopServerOpts | undefined): Promise<SlopServerHandle> {
    const inner = (store as unknown as { inner: IssueTrackerStore }).inner;
    const { server: httpServer, slop } = startSlopServer(inner, port, opts);
    return {
      wsUrl: `ws://localhost:${port}/slop`,
      stop: async () => {
        slop.stop();
        httpServer.stop();
      },
    };
  },
  scenarios: [
    exploreAndAct,
    triage,
    bulkUpdate,
    scaleTriage,
    negative,
    contextual,
    recovery,
    stateTransitions,
    crossEntity,
    conditional,
    ambiguity,
    complexWorkflow,
  ],
  verify(store, scenario) {
    if (!scenario.verify) return undefined;
    const inner = (store as unknown as { inner: IssueTrackerStore }).inner;
    return scenario.verify(inner);
  },
  mcpSystemPrompt:
    "You are an issue tracker agent. You have access to tools to interact with repositories, issues, and comments. " +
    "You have NO prior knowledge of the data — use the tools to discover the current state. " +
    'When done, respond with "DONE".',
  async startMcpServer(scale: DataScale, _variant: string): Promise<McpServerHandle> {
    // All current MCP variants share the flat server; prompt-level variants
    // are applied by the cell runner via resolveMcpVariant.
    const env: Record<string, string> = { ...process.env } as Record<string, string>;
    // The v1 server picks its dataset from this flag; clear any inherited
    // value for the small scale.
    if (scale === "l") env.BENCH_LARGE_DATASET = "1";
    else if (scale === "s") delete env.BENCH_LARGE_DATASET;
    else throw new Error(`issue-tracker mcp: scale "${scale}" not supported`);

    // URL.pathname is percent-encoded; decode it so a checkout path containing
    // spaces or non-ASCII characters still resolves when handed to `bun run`.
    const serverPath = decodeURIComponent(new URL("../../mcp-vs-slop/app/mcp-server.ts", import.meta.url).pathname);
    const transport = new StdioClientTransport({
      command: "bun",
      args: ["run", serverPath],
      env,
    });
    const client = new Client({ name: "slop-benchmarks-v2", version: "0.2.0" });
    await client.connect(transport);

    return {
      client,
      stop: async () => {
        await client.close();
      },
      verify: async (scenario: Scenario): Promise<VerificationResult | undefined> => {
        if (!scenario.verify) return undefined;
        // Rebuild a store from the server's state: repos, then issues per
        // repo, then comments per issue. Calls stay sequential so records are
        // appended in listing order.
        const tempStore = new IssueTrackerStore();
        const reposRes = await client.callTool({ name: "list_repos", arguments: {} });
        const repos = expectJsonArray<{ id: string }>(parseToolJson(reposRes, []), "list_repos");
        tempStore.repos = repos as unknown as IssueTrackerStore["repos"];
        for (const repo of repos) {
          const issuesRes = await client.callTool({ name: "list_issues", arguments: { repo_id: repo.id } });
          const issues = expectJsonArray<unknown>(parseToolJson(issuesRes, []), "list_issues");
          tempStore.issues.push(...(issues as IssueTrackerStore["issues"]));
        }
        for (const issue of tempStore.issues) {
          const commentsRes = await client.callTool({ name: "list_comments", arguments: { issue_id: issue.id } });
          const comments = expectJsonArray<unknown>(parseToolJson(commentsRes, []), "list_comments");
          tempStore.comments.push(...(comments as IssueTrackerStore["comments"]));
        }
        return scenario.verify(tempStore);
      },
    };
  },
};
""; + try { + return JSON.parse(text); + } catch { + return fallback; + } +} diff --git a/benchmarks/v2/apps/registry.ts b/benchmarks/v2/apps/registry.ts new file mode 100644 index 0000000..831e62d --- /dev/null +++ b/benchmarks/v2/apps/registry.ts @@ -0,0 +1,66 @@ +import type { Client } from "@modelcontextprotocol/sdk/client"; +import type { SlopServerOpts } from "../../mcp-vs-slop/app/slop-server.ts"; +import type { Scenario, VerificationResult } from "../../mcp-vs-slop/scenarios/types.ts"; +import type { AppId, DataScale } from "../runner/types.ts"; +import { crmApp } from "./crm/index.ts"; +import { fileBrowserApp } from "./file-browser/index.ts"; +import { issueTrackerApp } from "./issue-tracker.ts"; +import { todoApp } from "./todo/index.ts"; + +/** + * Store + server + scenarios for a given benchmark app. Each app is a tuple + * of (storeFactory, serverLauncher, scenarios) with a declared set of + * supported data scales. The sweep runner skips cells whose (app, scale) + * combination isn't supported. + */ +export interface AppBinding { + id: AppId; + supportedScales: DataScale[]; + /** Build a fresh store seeded for the requested scale. */ + createStore(scale: DataScale, seed: number): AppStore; + /** Boot a SLOP server exposing the given store. Returns stop() + URL. */ + startSlopServer(store: AppStore, port: number, opts: SlopServerOpts | undefined): Promise; + /** Scenarios available for this app. */ + scenarios: Scenario[]; + /** Run the scenario's verifier against this app's store. Returns undefined if the scenario has no verifier. */ + verify(store: AppStore, scenario: Scenario): VerificationResult | undefined; + /** + * Launch an MCP server for this app at the requested scale and return a + * handle. `variant` selects among fair-MCP variants (flat / flat+prompt / + * resources / prompts) — apps that only support `flat` may throw for the others. + */ + startMcpServer?(scale: DataScale, variant: string): Promise; + /** System prompt for MCP runs. 
/**
 * Everything the sweep runner needs to drive one benchmark app: a store
 * factory, server launchers, the scenario list and the scales the app can be
 * seeded at. Cells whose (app, scale) pair is not listed in `supportedScales`
 * are skipped by the runner.
 */
export interface AppBinding {
  id: AppId;
  supportedScales: DataScale[];
  /** Create a new store populated for `scale` using `seed`. */
  createStore(scale: DataScale, seed: number): AppStore;
  /** Start a SLOP server over `store` on `port`; resolves to its URL and a stop() hook. */
  startSlopServer(store: AppStore, port: number, opts: SlopServerOpts | undefined): Promise<SlopServerHandle>;
  /** Scenarios this app can run. */
  scenarios: Scenario[];
  /** Evaluate `scenario`'s verifier on `store`; undefined when the scenario defines none. */
  verify(store: AppStore, scenario: Scenario): VerificationResult | undefined;
  /**
   * Start an MCP server for this app at `scale`. `variant` picks one of the
   * fair-MCP variants (flat / flat+prompt / resources / prompts); an app that
   * only implements `flat` may throw for the rest.
   */
  startMcpServer?(scale: DataScale, variant: string): Promise<McpServerHandle>;
  /** System prompt for MCP runs. Domain-specific and tuned per app. */
  mcpSystemPrompt?: string;
}

/** A connected MCP client plus the hooks the cell runner needs around it. */
export interface McpServerHandle {
  client: Client;
  stop(): Promise<void>;
  /** Reconstruct sufficient state through MCP tool calls to evaluate the scenario's verifier. */
  verify(scenario: Scenario): Promise<VerificationResult | undefined>;
}

/** Opaque store wrapper; the concrete store type is owned by each app. */
export interface AppStore {
  readonly __brand: "app-store";
  readonly inner: unknown;
}

/** Address of a running SLOP server and the hook that shuts it down. */
export interface SlopServerHandle {
  wsUrl: string;
  stop(): Promise<void>;
}

const registry: Record<AppId, AppBinding> = {
  "issue-tracker": issueTrackerApp,
  todo: todoApp,
  "file-browser": fileBrowserApp,
  crm: crmApp,
};

/** Look up the binding registered for `id`; throws when none is registered. */
export function resolveApp(id: AppId): AppBinding {
  const found = registry[id];
  if (found) return found;
  throw new Error(`App not yet implemented in v2: ${id}`);
}
/** Brand a TodoStore so it can travel through the app-agnostic AppStore type. */
function wrap(inner: TodoStore): AppStore & { inner: TodoStore } {
  return { __brand: "app-store", inner } as AppStore & { inner: TodoStore };
}

/** v2 binding for the todo app: store factory, SLOP server, stdio MCP server and scenarios. */
export const todoApp: AppBinding = {
  id: "todo",
  supportedScales: ["s", "m", "l", "xl"],
  createStore(scale, seed) {
    const store = new TodoStore();
    store.reset(seedTodo(scale, seed));
    return wrap(store);
  },
  async startSlopServer(store, port, opts): Promise<SlopServerHandle> {
    const inner = (store as unknown as { inner: TodoStore }).inner;
    const { server, slop } = startTodoSlopServer(inner, port, opts as TodoSlopOpts | undefined);
    return {
      wsUrl: `ws://localhost:${port}/slop`,
      stop: async () => {
        slop.stop();
        server.stop();
      },
    };
  },
  scenarios: todoScenarios,
  verify(store, scenario) {
    if (!scenario.verify) return undefined;
    const inner = (store as unknown as { inner: TodoStore }).inner;
    // Scenario.verify is typed against the v1 store; todo verifiers cast back.
    return scenario.verify(inner as unknown as Parameters<NonNullable<Scenario["verify"]>>[0]);
  },
  mcpSystemPrompt:
    "You are a todo-list agent. You have tools to list tasks and mutate them. " +
    "You have no prior knowledge of the data — discover it by listing tasks. " +
    'When the task is complete, respond with "DONE".',
  /**
   * Spawn the stdio MCP server for the todo app and connect a client to it.
   *
   * `seed` is forwarded as BENCH_SEED and defaults to 42, the value that was
   * previously hard-coded. NOTE(review): AppBinding.startMcpServer does not
   * carry the sweep seed, so callers going through the interface always get
   * the default — SLOP and MCP cells only see the same data for seed 42.
   */
  async startMcpServer(scale: DataScale, _variant: string, seed = 42): Promise<McpServerHandle> {
    // Every variant we currently ship (flat, flat+prompt) uses the same
    // underlying stdio MCP server — only the system prompt differs, and
    // that's handled by the cell runner via resolveMcpVariant. If a future
    // variant needs a different server (resources, prompts), dispatch here.
    const env: Record<string, string> = { ...process.env } as Record<string, string>;
    env.BENCH_SCALE = scale;
    env.BENCH_SEED = String(seed);

    // URL#pathname is percent-encoded; decode it so a checkout path that
    // contains spaces or non-ASCII characters still resolves on disk.
    const serverPath = decodeURIComponent(new URL("./mcp-server.ts", import.meta.url).pathname);
    const transport = new StdioClientTransport({
      command: "bun",
      args: ["run", serverPath],
      env,
    });
    const client = new Client({ name: "slop-benchmarks-v2", version: "0.2.0" });
    await client.connect(transport);

    return {
      client,
      stop: async () => {
        await client.close();
      },
      verify: async (scenario: Scenario): Promise<VerificationResult | undefined> => {
        if (!scenario.verify) return undefined;
        // Reconstruct a TodoStore from one list_tasks call. No children to
        // recurse, so this is a single round trip.
        const res = (await client.callTool({ name: "list_tasks", arguments: {} })) as {
          content?: Array<{ type: string; text?: string }>;
          isError?: boolean;
        };
        const text = (res.content ?? []).find((c) => c.type === "text")?.text ?? "[]";
        if (res.isError) throw new Error(`todo mcp: list_tasks failed: ${text}`);

        let tasks: unknown;
        try {
          tasks = JSON.parse(text);
        } catch {
          throw new Error("todo mcp: list_tasks returned a non-JSON payload");
        }
        // Fail with a clear message rather than a TypeError inside reset().
        if (!Array.isArray(tasks)) throw new Error("todo mcp: list_tasks did not return an array");

        const tempStore = new TodoStore();
        tempStore.reset(tasks as Parameters<TodoStore["reset"]>[0]);
        return scenario.verify(tempStore as unknown as Parameters<NonNullable<Scenario["verify"]>>[0]);
      },
    };
  },
};
// Startup configuration. Both variables are optional: BENCH_SCALE falls back
// to "s" and BENCH_SEED to 42. Invalid values abort startup so a typo cannot
// silently produce an empty or differently-seeded data set.
const KNOWN_SCALES: readonly string[] = ["s", "m", "l", "xl"];
const rawScale = process.env.BENCH_SCALE ?? "s";
if (!KNOWN_SCALES.includes(rawScale)) {
  throw new Error(`todo-mcp: BENCH_SCALE must be one of ${KNOWN_SCALES.join(", ")}, got "${rawScale}"`);
}
const scale = rawScale as DataScale;
const seed = Number(process.env.BENCH_SEED ?? 42);
if (!Number.isFinite(seed)) {
  throw new Error(`todo-mcp: BENCH_SEED must be a number, got "${process.env.BENCH_SEED}"`);
}

const store = new TodoStore();
store.reset(seedTodo(scale, seed));

const server = new Server({ name: "todo-mcp", version: "0.2.0" }, { capabilities: { tools: {} } });

/** JSON-schema fragment shared by every tool that addresses a single task. */
const ID_PROP = { id: { type: "string", description: "Task id" } };

/** Describe a tool whose only argument is the task id. */
function idOnlyTool(name: string, description: string) {
  return {
    name,
    description,
    inputSchema: { type: "object" as const, properties: ID_PROP, required: ["id"] },
  };
}

/** Tool catalogue advertised to the client, in listing order. */
const TOOLS = [
  {
    name: "list_tasks",
    description: "List every task in the todo app",
    inputSchema: { type: "object" as const, properties: {} },
  },
  idOnlyTool("get_task", "Get a single task by id"),
  idOnlyTool("mark_done", "Mark a task as done (no-op if already done)"),
  idOnlyTool("reopen_task", "Mark a done task as not done (no-op if already undone)"),
  {
    name: "set_priority",
    description: "Set a task's priority (low, medium, high)",
    inputSchema: {
      type: "object" as const,
      properties: {
        ...ID_PROP,
        priority: { type: "string", description: "low | medium | high" },
      },
      required: ["id", "priority"],
    },
  },
  {
    name: "set_tag",
    description: "Set a task's tag. Empty string clears it.",
    inputSchema: {
      type: "object" as const,
      properties: {
        ...ID_PROP,
        tag: { type: "string", description: "Tag name; empty string to clear" },
      },
      required: ["id", "tag"],
    },
  },
  {
    name: "edit_title",
    description: "Rename a task",
    inputSchema: {
      type: "object" as const,
      properties: {
        ...ID_PROP,
        title: { type: "string", description: "New title" },
      },
      required: ["id", "title"],
    },
  },
  idOnlyTool("delete_task", "Delete a task permanently"),
];

server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));

// Dispatch a tool call. Every failure is reported as an `isError` result
// carrying `{ error }` rather than as a protocol-level exception.
server.setRequestHandler(CallToolRequestSchema, async (req) => {
  const { name, arguments: args } = req.params;
  const a = (args ?? {}) as Record<string, unknown>;
  try {
    if (name === "list_tasks") return json(store.tasks);
    if (!TOOLS.some((tool) => tool.name === name)) return err(`unknown tool ${name}`);

    // Every remaining tool addresses one task; resolve it once up front.
    if (a.id == null) return err("missing required argument: id");
    const t = store.get(String(a.id));
    if (!t) return err(`task ${a.id} not found`);

    switch (name) {
      case "get_task":
        return json(t);
      case "mark_done":
        store.setDone(t.id, true);
        return json({ id: t.id, done: true });
      case "reopen_task":
        store.setDone(t.id, false);
        return json({ id: t.id, done: false });
      case "set_priority": {
        const p = String(a.priority);
        if (!["low", "medium", "high"].includes(p)) return err(`invalid priority ${p}`);
        store.setPriority(t.id, p as Priority);
        return json({ id: t.id, priority: p });
      }
      case "set_tag": {
        const tag = String(a.tag ?? "");
        const value = tag === "" ? null : tag;
        store.setTag(t.id, value);
        return json({ id: t.id, tag: value });
      }
      case "edit_title":
        // String(undefined) would rename the task to the text "undefined".
        if (a.title == null) return err("missing required argument: title");
        store.editTitle(t.id, String(a.title));
        return json({ id: t.id });
      case "delete_task":
        store.delete(t.id);
        return json({ deleted: t.id });
      default:
        return err(`unknown tool ${name}`);
    }
  } catch (e) {
    return err(e instanceof Error ? e.message : String(e));
  }
});

/** Wrap `data` as a successful tool result with a single JSON text part. */
function json(data: unknown) {
  return { content: [{ type: "text", text: JSON.stringify(data) }] };
}

/** Wrap `msg` as a failed tool result: `{ error: msg }` as JSON text, flagged isError. */
function err(msg: string) {
  return { content: [{ type: "text", text: JSON.stringify({ error: msg }) }], isError: true };
}

const transport = new StdioServerTransport();
await server.connect(transport);
const empty: Scenario["steps"] = [];

/** Build one check entry: passes when nothing offends, otherwise carries `failure` as detail. */
function check(name: string, offenders: number, failure: string) {
  const passed = offenders === 0;
  return { name, passed, detail: passed ? undefined : failure };
}

/** Passes when no task in the store is still undone. */
function verifyAllDone(store: TodoStore): VerificationResult {
  const pending = store.tasks.filter((t) => !t.done).length;
  return {
    passed: pending === 0,
    checks: [check("all tasks are done", pending, `${pending} tasks still undone`)],
  };
}

/** Passes when no done task is left in the store. */
function verifyOnlyUndoneRemain(store: TodoStore): VerificationResult {
  const leftover = store.tasks.filter((t) => t.done).length;
  return {
    passed: leftover === 0,
    checks: [check("no done tasks remain", leftover, `${leftover} done tasks were not deleted`)],
  };
}

/**
 * Passes when every bug-tagged task is high priority and no other task is.
 * The second half relies on the seed never giving a non-bug task high
 * priority, so any such task must have been changed by the agent.
 */
function verifyBugsHighPriority(store: TodoStore): VerificationResult {
  let bugsNotHigh = 0;
  let othersHigh = 0;
  for (const t of store.tasks) {
    const isHigh = t.priority === "high";
    if (t.tag === "bug") {
      if (!isHigh) bugsNotHigh += 1;
    } else if (isHigh) {
      othersHigh += 1;
    }
  }
  return {
    passed: bugsNotHigh === 0 && othersHigh === 0,
    checks: [
      check("every bug tagged task is priority=high", bugsNotHigh, `${bugsNotHigh} bug tasks not high priority`),
      check("no non-bug tasks elevated to high", othersHigh, `${othersHigh} non-bug tasks incorrectly promoted`),
    ],
  };
}

/** Assemble a scenario entry; `steps` stays empty because scripted mode is unused here. */
function scenario(
  name: string,
  description: string,
  agentPrompt: string,
  verifier: (store: TodoStore) => VerificationResult,
): Scenario {
  return {
    name,
    description,
    agentPrompt,
    steps: empty,
    verify: (store) => verifier(store as unknown as TodoStore),
  };
}

export const todoScenarios: Scenario[] = [
  scenario(
    "mark-all-done",
    "Mark every task as done, touching each task exactly once.",
    "Mark every task as done. Don't skip any. Don't delete or modify anything else.",
    verifyAllDone,
  ),
  scenario(
    "delete-completed",
    "Delete every already-done task, leaving only undone tasks.",
    "Delete every task that is already marked done. Leave the undone tasks untouched.",
    verifyOnlyUndoneRemain,
  ),
  scenario(
    "prioritize-bugs",
    "Set priority=high on every task tagged as 'bug'.",
    "For every task tagged as 'bug', set its priority to high. Don't touch any other tasks.",
    verifyBugsHighPriority,
  ),
];
const PRIORITIES: Priority[] = ["low", "medium", "high"];
const TAGS = ["bug", "meeting", "errand", "read", "chore", "work", "personal"];
const TITLES = [
  "Fix login redirect loop",
  "Review sprint metrics",
  "Pick up groceries",
  "Read new pricing RFC",
  "Update quarterly OKRs",
  "Call dentist",
  "Refactor data loader",
  "Write postmortem",
  "Prep 1:1 agenda",
  "Cancel old subscription",
];

const SIZES: Record<DataScale, number> = { s: 8, m: 30, l: 100, xl: 500 };
const BUG_SHARE = 0.2; // ~20% of tasks tagged as bug
const DONE_SHARE = 0.25;

// Loop-invariant pools for non-bug tasks, built once instead of per task.
const NON_BUG_TAGS = TAGS.filter((t) => t !== "bug");
const NON_BUG_PRIORITIES: Priority[] = ["low", "medium"];

/**
 * Deterministic seeded PRNG (xorshift32 with the 13/17/5 shift triple).
 * The returned function yields values in [0, 1) quantised to 1e-6 steps.
 *
 * The seed is reduced to its int32 value first. The bitwise steps do that
 * implicitly anyway, but a truthy seed whose int32 value is 0 (0.5, 2**32, …)
 * used to slip past the zero guard and lock the state at 0 forever. Such
 * seeds now fall back to the default state like 0 and NaN do.
 */
function makeRng(seed: number) {
  let x = (seed | 0) || 0x1234567;
  return () => {
    x ^= x << 13;
    x ^= x >>> 17;
    x ^= x << 5;
    return ((x >>> 0) % 1_000_000) / 1_000_000;
  };
}

/**
 * Generate the todo data set for `scale`. Two calls with the same
 * (scale, seed) return identical tasks.
 *
 * @throws Error when `scale` has no entry in SIZES.
 */
export function seedTodo(scale: DataScale, seed: number): Task[] {
  const count = SIZES[scale];
  if (count === undefined) throw new Error(`todo seed: unknown scale "${scale}"`);

  const rng = makeRng(seed);
  const pick = <T>(pool: readonly T[]): T => pool[Math.floor(rng() * pool.length)] as T;

  const out: Task[] = [];
  for (let i = 0; i < count; i++) {
    // The order of rng() draws below is part of the data format: changing it
    // changes every generated data set.
    const isBug = rng() < BUG_SHARE;
    // Non-bug tasks are capped at medium so `prioritize-bugs` can check
    // "no non-bug is high" without needing a pre-state snapshot. Bugs start
    // at anything and the agent is asked to raise them to high.
    const tag = isBug ? "bug" : rng() < 0.7 ? pick(NON_BUG_TAGS) : null;
    const priority = isBug ? pick(PRIORITIES) : pick(NON_BUG_PRIORITIES);
    const done = rng() < DONE_SHARE;
    out.push({
      id: `task-${i + 1}`,
      title: `${TITLES[i % TITLES.length]} #${i + 1}`,
      priority,
      tag,
      done,
      createdAt: 1_700_000_000_000 + i * 1000,
    });
  }
  return out;
}
export interface TodoSlopOpts {
  maxNodes?: number;
  maxDepth?: number;
  /**
   * When true, the `tasks` collection carries a summary line, its children
   * are ordered by descending salience (see salienceFor) and every task node
   * is tagged with `meta.salience`. When false or omitted, every task is
   * emitted inline in store order with no summary and no salience.
   */
  optimized?: boolean;
}

/** Priorities accepted by `set_priority`; mirrors the check in the todo MCP server. */
const VALID_PRIORITIES: readonly string[] = ["low", "medium", "high"];

/**
 * Build a SlopServer exposing `store` as two registered nodes: an `overview`
 * context node with counts, and a `tasks` collection with one child per task.
 * `maxNodes` / `maxDepth` are forwarded to the server only when provided.
 */
export function createTodoSlopServer(store: TodoStore, opts?: TodoSlopOpts) {
  const slop = new SlopServer({
    id: "todo",
    name: "Todo",
    ...(opts?.maxNodes != null && { maxNodes: opts.maxNodes }),
    ...(opts?.maxDepth != null && { maxDepth: opts.maxDepth }),
  });

  const optimized = opts?.optimized ?? false;

  slop.register("overview", () => {
    const done = store.tasks.filter((t) => t.done).length;
    const undone = store.tasks.length - done;
    const bugs = store.tasks.filter((t) => t.tag === "bug").length;
    return {
      type: "context",
      props: {
        total: store.tasks.length,
        done,
        undone,
        bugs,
      },
      summary: `${store.tasks.length} tasks (${undone} undone, ${done} done, ${bugs} tagged bug)`,
    };
  });

  slop.register("tasks", () => {
    const all = store.tasks;
    if (!optimized) {
      return {
        type: "collection",
        props: { count: all.length },
        children: Object.fromEntries(all.map((task) => [task.id, buildTaskNode(store, slop, task)])),
      } satisfies NodeDescriptor;
    }

    const done = all.filter((t) => t.done).length;
    const undone = all.length - done;
    // Sort a copy so the store's own order is left untouched.
    const ranked = all
      .map((task) => ({ task, salience: salienceFor(task) }))
      .sort((a, b) => b.salience - a.salience);
    return {
      type: "collection",
      props: { count: all.length },
      summary: `${all.length} tasks: ${undone} undone, ${done} done.`,
      children: Object.fromEntries(
        ranked.map(({ task, salience }) => [task.id, buildTaskNode(store, slop, task, salience)]),
      ),
    } satisfies NodeDescriptor;
  });

  return slop;
}

/**
 * Salience score in [0, 1]: 0.1 for done tasks and 0.5 for undone ones,
 * +0.3 for an undone bug, +0.2 for high priority, capped at 1.
 */
function salienceFor(t: Task): number {
  let score = t.done ? 0.1 : 0.5;
  if (!t.done && t.tag === "bug") score += 0.3;
  if (t.priority === "high") score += 0.2;
  return Math.min(1, score);
}

/**
 * Describe one task as a SLOP node with its actions. Every handler mutates
 * the store and then calls `slop.refresh()`. `salience`, when given, is
 * attached as `meta.salience`.
 */
function buildTaskNode(
  store: TodoStore,
  slop: SlopServer,
  task: Task,
  salience?: number,
): NodeDescriptor {
  const actions: NonNullable<NodeDescriptor["actions"]> = {
    edit_title: {
      label: "Edit title",
      description: "Rename this task",
      params: { title: { type: "string", description: "New title" } },
      handler: async (params) => {
        store.editTitle(task.id, params.title as string);
        slop.refresh();
        return { id: task.id };
      },
    },
    set_priority: {
      label: "Set priority",
      description: "Set task priority (low, medium, high)",
      params: {
        priority: { type: "string", description: "low | medium | high" },
      },
      handler: async (params) => {
        // Reject values the MCP tool would also reject, so both protocols
        // accept the same inputs. NOTE(review): assumes SlopServer surfaces a
        // rejected handler as an action error, as it must already do when
        // store.mustGet throws — confirm.
        const priority = String(params.priority);
        if (!VALID_PRIORITIES.includes(priority)) throw new Error(`invalid priority ${priority}`);
        store.setPriority(task.id, priority as Priority);
        slop.refresh();
        return { id: task.id };
      },
    },
    set_tag: {
      label: "Set tag",
      description: "Assign a tag to this task (empty string clears it)",
      params: { tag: { type: "string", description: "Tag name, or empty string to clear" } },
      handler: async (params) => {
        const t = String(params.tag ?? "");
        store.setTag(task.id, t === "" ? null : t);
        slop.refresh();
        return { id: task.id };
      },
    },
    delete: {
      label: "Delete task",
      description: "Delete this task permanently",
      params: {},
      handler: async () => {
        store.delete(task.id);
        slop.refresh();
        return { deleted: task.id };
      },
    },
  };

  // State-dependent affordance: only expose `mark_done` when not done, and
  // `reopen` when done — this is a key SLOP pitch so we exercise it.
  if (task.done) {
    actions.reopen = {
      label: "Reopen",
      description: "Mark this task as not done",
      params: {},
      handler: async () => {
        store.setDone(task.id, false);
        slop.refresh();
        return { id: task.id };
      },
    };
  } else {
    actions.mark_done = {
      label: "Mark done",
      description: "Mark this task as done",
      params: {},
      handler: async () => {
        store.setDone(task.id, true);
        slop.refresh();
        return { id: task.id };
      },
    };
  }

  const node: NodeDescriptor = {
    type: "task",
    props: {
      title: task.title,
      done: task.done,
      priority: task.priority,
      // A missing tag is exposed as an empty string.
      tag: task.tag ?? "",
    },
    actions,
  };
  if (salience !== undefined) {
    node.meta = { salience };
  }
  return node;
}

/**
 * Serve the todo SLOP server over Bun on `port`, mounted at `/slop`.
 * Requests the SLOP handler does not answer get a plain 200 banner.
 */
export function startTodoSlopServer(store: TodoStore, port: number, opts?: TodoSlopOpts) {
  const slop = createTodoSlopServer(store, opts);
  const handler = bunHandler(slop, { path: "/slop" });
  const server = Bun.serve({
    port,
    fetch(req, srv) {
      const resp = handler.fetch(req, srv);
      if (resp) return resp;
      return new Response("SLOP Todo benchmark server", { status: 200 });
    },
    websocket: handler.websocket,
  });
  return { server, slop };
}
export type Priority = "low" | "medium" | "high";

export interface Task {
  id: string;
  title: string;
  done: boolean;
  priority: Priority;
  tag: string | null;
  createdAt: number;
}

/** In-memory task list backing the todo benchmark app. */
export class TodoStore {
  tasks: Task[] = [];

  /** Replace the contents with shallow copies of `tasks`, so callers' objects are never mutated. */
  reset(tasks: Task[]) {
    this.tasks = tasks.map((t) => ({ ...t }));
  }

  /** Find a task by id, or undefined when there is none. */
  get(id: string): Task | undefined {
    return this.tasks.find((t) => t.id === id);
  }

  /**
   * Find a task by id.
   * @throws Error when no task has that id.
   */
  mustGet(id: string): Task {
    const t = this.get(id);
    if (!t) throw new Error(`Task ${id} not found`);
    return t;
  }

  /**
   * Append a task and return it. `createdAt` is set to the current time.
   * When `id` is omitted or undefined, a free `task-N` id is generated.
   */
  add(task: Omit<Task, "id" | "createdAt"> & { id?: string }): Task {
    // Pull `id` out before spreading: an explicit `id: undefined` must not
    // overwrite the generated id.
    const { id: requestedId, ...fields } = task;
    const t: Task = { id: requestedId ?? this.nextFreeId(), createdAt: Date.now(), ...fields };
    this.tasks.push(t);
    return t;
  }

  /**
   * First unused `task-N` id, starting at N = length + 1. After deletions
   * that starting point may already be taken, so keep counting upwards.
   */
  private nextFreeId(): string {
    const taken = new Set(this.tasks.map((t) => t.id));
    let n = this.tasks.length + 1;
    while (taken.has(`task-${n}`)) n += 1;
    return `task-${n}`;
  }

  /** Flip a task's done flag. Throws when the task does not exist. */
  toggleDone(id: string): Task {
    const t = this.mustGet(id);
    t.done = !t.done;
    return t;
  }

  /** Set a task's done flag. Throws when the task does not exist. */
  setDone(id: string, done: boolean): Task {
    const t = this.mustGet(id);
    t.done = done;
    return t;
  }

  /** Set a task's priority. Throws when the task does not exist. */
  setPriority(id: string, priority: Priority): Task {
    const t = this.mustGet(id);
    t.priority = priority;
    return t;
  }

  /** Set or clear (null) a task's tag. Throws when the task does not exist. */
  setTag(id: string, tag: string | null): Task {
    const t = this.mustGet(id);
    t.tag = tag;
    return t;
  }

  /** Rename a task. Throws when the task does not exist. */
  editTitle(id: string, title: string): Task {
    const t = this.mustGet(id);
    t.title = title;
    return t;
  }

  /** Remove the task with `id`; does nothing when there is no such task. */
  delete(id: string): void {
    this.tasks = this.tasks.filter((t) => t.id !== id);
  }
}
+ */ +export const ablationSweep: SweepConfig = { + id: "ablation-prompts-encodings", + providers: [ + { kind: "openai-compat", baseUrl: DGX_URL, model: "gemma4:31b" }, + ], + promptVariants: ["minimal", "spec", "spec-terse"], + encodingVariants: ["indented-text", "json-compact", "markdown-headings"], + optimizationVariants: ["off"], + protocols: ["slop", "mcp"], + mcpVariants: ["flat", "flat+prompt"], + apps: ["todo"], + dataScales: ["s"], + scenarioFilter: ["mark-all-done"], + seeds: [42], + iterations: 1, + maxConcurrency: 1, + maxTurns: 30, + temperature: 0, +}; diff --git a/benchmarks/v2/config/smoke-crm.ts b/benchmarks/v2/config/smoke-crm.ts new file mode 100644 index 0000000..c638fc4 --- /dev/null +++ b/benchmarks/v2/config/smoke-crm.ts @@ -0,0 +1,28 @@ +import type { SweepConfig } from "../runner/types.ts"; + +const DGX_URL = process.env.SLOP_DGX_URL ?? "http://slopinator-s-1.local:11434/v1"; + +/** + * Validation sweep for the crm app. Runs one easy scenario (high-value-alert) + * on both SLOP and MCP at scale=s so we can see the top-of-ladder end-to-end + * without blowing out token budgets. + */ +export const smokeCrmSweep: SweepConfig = { + id: "smoke-crm", + providers: [ + { kind: "openai-compat", baseUrl: DGX_URL, model: "gemma4:31b" }, + ], + promptVariants: ["spec"], + encodingVariants: ["indented-text"], + optimizationVariants: ["off"], + protocols: ["slop", "mcp"], + mcpVariants: ["flat"], + apps: ["crm"], + dataScales: ["s"], + scenarioFilter: ["high-value-alert"], + seeds: [42], + iterations: 1, + maxConcurrency: 1, + maxTurns: 40, + temperature: 0, +}; diff --git a/benchmarks/v2/config/smoke-file-browser.ts b/benchmarks/v2/config/smoke-file-browser.ts new file mode 100644 index 0000000..a96e143 --- /dev/null +++ b/benchmarks/v2/config/smoke-file-browser.ts @@ -0,0 +1,28 @@ +import type { SweepConfig } from "../runner/types.ts"; + +const DGX_URL = process.env.SLOP_DGX_URL ?? 
"http://slopinator-s-1.local:11434/v1"; + +/** + * Validation sweep for the file-browser app. delete-empty-dirs exercises the + * state-dependent affordance (delete only available on empty dirs) on SLOP, + * which MCP has no equivalent of. + */ +export const smokeFileBrowserSweep: SweepConfig = { + id: "smoke-file-browser", + providers: [ + { kind: "openai-compat", baseUrl: DGX_URL, model: "gemma4:31b" }, + ], + promptVariants: ["spec"], + encodingVariants: ["indented-text"], + optimizationVariants: ["off"], + protocols: ["slop", "mcp"], + mcpVariants: ["flat"], + apps: ["file-browser"], + dataScales: ["s"], + scenarioFilter: ["delete-empty-dirs"], + seeds: [42], + iterations: 1, + maxConcurrency: 1, + maxTurns: 30, + temperature: 0, +}; diff --git a/benchmarks/v2/config/smoke-mcp.ts b/benchmarks/v2/config/smoke-mcp.ts new file mode 100644 index 0000000..da1253c --- /dev/null +++ b/benchmarks/v2/config/smoke-mcp.ts @@ -0,0 +1,32 @@ +import type { SweepConfig } from "../runner/types.ts"; + +const DGX_URL = process.env.SLOP_DGX_URL ?? "http://slopinator-s-1.local:11434/v1"; + +/** + * SLOP-vs-MCP head-to-head smoke. One fast scenario, one model, one iter per + * cell × 2 protocols = 2 cells. Validates that the MCP cell runner works + * end-to-end and that verification via reconstruction passes. 
+ */ +export const smokeMcpSweep: SweepConfig = { + id: "smoke-mcp", + providers: [ + { + kind: "openai-compat", + baseUrl: DGX_URL, + model: "gemma4:31b", + }, + ], + promptVariants: ["spec"], + encodingVariants: ["indented-text"], + optimizationVariants: ["off"], + protocols: ["slop", "mcp"], + mcpVariants: ["flat"], + apps: ["issue-tracker"], + dataScales: ["s"], + scenarioFilter: ["explore-and-act"], + seeds: [42], + iterations: 1, + maxConcurrency: 1, + maxTurns: 20, + temperature: 0, +}; diff --git a/benchmarks/v2/config/smoke-todo.ts b/benchmarks/v2/config/smoke-todo.ts new file mode 100644 index 0000000..7875e55 --- /dev/null +++ b/benchmarks/v2/config/smoke-todo.ts @@ -0,0 +1,27 @@ +import type { SweepConfig } from "../runner/types.ts"; + +const DGX_URL = process.env.SLOP_DGX_URL ?? "http://slopinator-s-1.local:11434/v1"; + +/** + * Validation sweep for the todo app. Small scale, one scenario, SLOP vs MCP. + * Confirms the new app binding works end-to-end. + */ +export const smokeTodoSweep: SweepConfig = { + id: "smoke-todo", + providers: [ + { kind: "openai-compat", baseUrl: DGX_URL, model: "gemma4:31b" }, + ], + promptVariants: ["spec"], + encodingVariants: ["indented-text"], + optimizationVariants: ["off"], + protocols: ["slop", "mcp"], + mcpVariants: ["flat"], + apps: ["todo"], + dataScales: ["s"], + scenarioFilter: ["mark-all-done"], + seeds: [42], + iterations: 1, + maxConcurrency: 1, + maxTurns: 30, + temperature: 0, +}; diff --git a/benchmarks/v2/config/smoke.ts b/benchmarks/v2/config/smoke.ts new file mode 100644 index 0000000..4875301 --- /dev/null +++ b/benchmarks/v2/config/smoke.ts @@ -0,0 +1,31 @@ +import type { SweepConfig } from "../runner/types.ts"; + +const DGX_URL = process.env.SLOP_DGX_URL ?? "http://slopinator-s-1.local:11434/v1"; + +/** + * Smoke sweep — smallest useful cross-section. Runs on one model, one prompt, + * one encoding, two optimization levels, one scenario, small data, 3 iters. 
+ * Target wall time: a couple of minutes on DGX gemma4:31b. + */ +export const smokeSweep: SweepConfig = { + id: "smoke", + providers: [ + { + kind: "openai-compat", + baseUrl: DGX_URL, + model: "gemma4:31b", + }, + ], + promptVariants: ["spec"], + encodingVariants: ["indented-text"], + optimizationVariants: ["off", "combined"], + protocols: ["slop"], + apps: ["issue-tracker"], + dataScales: ["s"], + scenarioFilter: ["explore-and-act"], + seeds: [42], + iterations: 3, + maxConcurrency: 1, + maxTurns: 20, + temperature: 0, +}; diff --git a/benchmarks/v2/dashboard/app.js b/benchmarks/v2/dashboard/app.js new file mode 100644 index 0000000..6f88e0e --- /dev/null +++ b/benchmarks/v2/dashboard/app.js @@ -0,0 +1,433 @@ +// SLOP benchmarks v2 — dashboard client. Aggregates runs.jsonl in the browser +// so the user can pivot on any two axes without regenerating data. Mirrors +// the math in benchmarks/v2/metrics/stats.ts (kept deliberately tiny). + +const AXES = [ + { id: "app", label: "app", pick: (c) => c.cell.app }, + { id: "scale", label: "scale", pick: (c) => c.cell.scale }, + { id: "scenario", label: "scenario", pick: (c) => c.cell.scenario }, + { id: "variant", label: "variant", pick: (c) => variantLabel(c.cell) }, + { id: "model", label: "model", pick: (c) => `${c.cell.provider.kind}:${c.cell.provider.model}` }, + { id: "protocol", label: "protocol", pick: (c) => c.cell.protocol }, + { id: "optimization", label: "optimization", pick: (c) => c.cell.optimization }, +]; + +const METRICS = [ + { id: "passRate", label: "pass rate", format: (v) => `${(v * 100).toFixed(0)}%`, pick: (a) => a.passRate }, + { id: "totalTokens", label: "tokens (mean)", format: fmtInt, pick: (a) => a.totalTokens.mean }, + { id: "tokensPerSuccess", label: "tokens per success", format: fmtInt, pick: (a) => a.tokensPerSuccess }, + { id: "maxContextTokens", label: "max context tokens", format: fmtInt, pick: (a) => a.maxContextTokens.mean }, + { id: "turns", label: "turns (mean)", format: (v) => 
// Mutable UI state shared by the event handlers and render() in this file.
const state = {
  // Run records parsed from the loaded runs.jsonl (assigned in loadRunsText).
  runs: [],
  // Result of aggregateCells(state.runs); this is what the pivot table reads.
  cellAggregates: [],
  // Ids of the AXES entries used for the table rows / columns.
  rowAxis: "variant",
  colAxis: "scenario",
  // Id of the METRICS entry shown in each table cell.
  metric: "totalTokens",
  // Exact-match filters applied by filterCells; an empty string disables that filter.
  filters: { app: "", scenario: "", scale: "" },
};
/**
 * Parses the text of a runs.jsonl file and makes it the dashboard's data set.
 *
 * Blank lines, lines that are not valid JSON, and JSON values that are not
 * objects are skipped. Only objects carrying a `cellId` are kept as run
 * records; the `type: "sweep"` header row is ignored.
 *
 * After the filter selects are repopulated, `state.filters` is re-read from
 * them. populateFilter falls back to the empty option when the previously
 * selected value does not occur in the new data, and that programmatic change
 * fires no "change" event, so without the re-sync a stale filter would keep
 * hiding every cell of the newly loaded sweep.
 *
 * @param text   Full contents of the runs.jsonl file.
 * @param source Name shown in the status line (sweep id or file name).
 */
function loadRunsText(text, source) {
  const runs = [];
  for (const line of text.split("\n")) {
    if (line.trim().length === 0) continue;
    let obj;
    try {
      obj = JSON.parse(line);
    } catch {
      // Tolerate truncated or otherwise unparseable lines.
      continue;
    }
    // A bare `null` (or other falsy JSON value) is valid JSON but not a record.
    if (!obj || obj.type === "sweep") continue;
    if (obj.cellId) runs.push(obj);
  }

  state.runs = runs;
  state.cellAggregates = aggregateCells(runs);
  setStatus(`loaded ${source}: ${runs.length} runs, ${state.cellAggregates.length} cells`);

  // Populate filter selects from data, then mirror whatever each select ended
  // up showing back into state so the UI and the applied filter agree.
  const filterSources = [
    { selectId: "filterApp", key: "app", pick: (r) => r.cell.app },
    { selectId: "filterScenario", key: "scenario", pick: (r) => r.cell.scenario },
    { selectId: "filterScale", key: "scale", pick: (r) => r.cell.scale },
  ];
  for (const { selectId, key, pick } of filterSources) {
    populateFilter(selectId, runs.map(pick));
    state.filters[key] = document.getElementById(selectId).value;
  }
  render();
}
no cells match the current filters
'; + return; + } + const rowAxis = AXES.find((a) => a.id === state.rowAxis); + const colAxis = AXES.find((a) => a.id === state.colAxis); + const metric = METRICS.find((m) => m.id === state.metric); + + const rowValues = unique(cells.map((c) => rowAxis.pick(c))); + const colValues = unique(cells.map((c) => colAxis.pick(c))); + + const grid = new Map(); + for (const c of cells) { + const key = `${rowAxis.pick(c)}__${colAxis.pick(c)}`; + const list = grid.get(key) ?? []; + list.push(c); + grid.set(key, list); + } + + let html = ""; + for (const col of colValues) html += ``; + html += ""; + for (const row of rowValues) { + html += ``; + for (const col of colValues) { + const list = grid.get(`${row}__${col}`) ?? []; + if (list.length === 0) { + html += ""; + } else { + const mergedPass = mergePassRate(list); + const val = mergeMetric(list, metric); + const cellId = list.map((c) => c.cellId).join(","); + const sample = list.reduce((a, c) => a + c.runs, 0); + const passClass = mergedPass === 1 ? "" : mergedPass >= 0.5 ? "warn" : "bad"; + html += `"; + } + } + html += ""; + } + html += "
${escapeHtml(col)}
${escapeHtml(row)}` + + `
${escapeHtml(metric.format(val))}
` + + `
n=${sample}${list[0].runs > 1 ? ` ± ${fmtInt(stdevOf(list, metric))}` : ""}
` + + `
` + + "
"; + host.innerHTML = html; + + host.querySelectorAll("td.cell").forEach((td) => { + td.addEventListener("click", () => openCellDetail(td.dataset.cells)); + }); +} + +function mergePassRate(aggs) { + const totalRuns = aggs.reduce((a, c) => a + c.runs, 0); + if (totalRuns === 0) return 0; + const totalPass = aggs.reduce((a, c) => a + c.passRate * c.runs, 0); + return totalPass / totalRuns; +} + +function mergeMetric(aggs, metric) { + // Weighted mean across all aggregates matching the pivot cell. + const totalRuns = aggs.reduce((a, c) => a + c.runs, 0); + if (totalRuns === 0) return 0; + let sum = 0; + for (const c of aggs) { + const v = metric.pick(c); + if (!isFinite(v)) continue; + sum += v * c.runs; + } + return sum / totalRuns; +} + +function stdevOf(aggs, metric) { + // Pooled approximate stdev — good enough for a dashboard hover. + let n = 0; + let ssq = 0; + let mean = 0; + for (const c of aggs) { + const nv = c.runs; + const mv = metric.pick(c); + if (!isFinite(mv)) continue; + const delta = mv - mean; + n += nv; + mean += (delta * nv) / n; + ssq += nv * delta * (mv - mean); + } + if (n < 2) return 0; + return Math.sqrt(ssq / (n - 1)); +} + +function openCellDetail(cellIds) { + const ids = cellIds.split(","); + const aggs = state.cellAggregates.filter((c) => ids.includes(c.cellId)); + if (aggs.length === 0) return; + const runs = state.runs.filter((r) => ids.includes(r.cellId)); + const title = `${aggs.map((a) => variantLabel(a.cell)).join(" / ")}`; + document.getElementById("modal-title").textContent = title; + const body = document.getElementById("modal-body"); + + let html = ""; + for (const a of aggs) { + const cats = a.failureCategories; + const total = Object.values(cats).reduce((s, v) => s + v, 0); + html += `

${escapeHtml(variantLabel(a.cell))} × ${escapeHtml(a.cell.scenario)}

`; + html += "
"; + html += `
cellId
${escapeHtml(a.cellId)}
`; + html += `
runs
${a.runs}
`; + html += `
pass rate
${(a.passRate * 100).toFixed(0)}%
`; + html += `
spec compliance
${(a.specComplianceRate.mean * 100).toFixed(0)}%
`; + html += `
tokens
${fmtInt(a.totalTokens.mean)} (p95 ${fmtInt(a.totalTokens.p95)}, σ ${fmtInt(a.totalTokens.stdev)})
`; + html += `
max context
${fmtInt(a.maxContextTokens.mean)}
`; + html += `
turns
${a.turns.mean.toFixed(1)}
`; + html += `
tool calls
${a.toolCalls.mean.toFixed(1)}
`; + html += `
wall time
${(a.totalTimeMs.mean / 1000).toFixed(1)}s
`; + html += `
llm time
${(a.llmTimeMs.mean / 1000).toFixed(1)}s
`; + html += `
$ per success
${fmtCost(a.costPerSuccess)}
`; + html += "
"; + if (total > 0) { + html += "
"; + for (const [key, val] of Object.entries(cats)) { + if (val === 0) continue; + html += `
`; + } + html += "
"; + html += `
${Object.entries(cats).filter(([, v]) => v > 0).map(([k, v]) => `${k}=${v}`).join(" / ")}
`; + } + } + + // Per-run table + html += "

per-run detail

"; + html += "
";
+  html += `${"iter".padEnd(5)}${"turns".padStart(7)}${"calls".padStart(7)}${"spec%".padStart(7)}${"tok".padStart(9)}${"ctxMx".padStart(8)}${"t(s)".padStart(8)} verify\n`;
+  for (const r of runs) {
+    const m = r.metrics;
+    if (!m) {
+      html += `${String(r.cell.iteration).padEnd(5)} ERROR: ${escapeHtml((r.error ?? "").split("\n")[0])}\n`;
+      continue;
+    }
+    const v = m.verification ? `${m.verification.passedChecks}/${m.verification.totalChecks}` : "—";
+    html += `${String(r.cell.iteration).padEnd(5)}${String(m.turns).padStart(7)}${String(m.toolCalls).padStart(7)}${`${(m.specComplianceRate * 100).toFixed(0)}%`.padStart(7)}${fmtInt(m.totalTokens).padStart(9)}${fmtInt(m.maxContextTokens).padStart(8)}${(m.totalTimeMs / 1000).toFixed(1).padStart(8)} ${v}\n`;
+  }
+  html += "
/**
 * Groups run records by `cellId` and computes per-cell aggregates for the
 * pivot table.
 *
 * Kept in line with the server-side aggregation in metrics/stats.ts:
 *  - records sharing a `runId` are de-duplicated, preferring a record that
 *    has metrics and no error over one that does not (a resumed sweep writes
 *    a second record for a run that previously errored);
 *  - a run counts as passed only when it has no `error` and its verification
 *    reports `passed === true`.
 * Records without a `runId` are kept as they are. Cells in which no record
 * has metrics are omitted from the result.
 *
 * @param runs Run records parsed from runs.jsonl.
 * @returns One aggregate object per cell that has at least one record with metrics.
 */
function aggregateCells(runs) {
  const isUsable = (r) => !!r.metrics && !r.error;

  // De-duplicate by runId. A Map keeps first-insertion order even when an
  // entry is later replaced, so bucket order stays stable.
  const byRun = new Map();
  for (const r of runs) {
    const key = r.runId ? r.runId : Symbol("run-without-id");
    const kept = byRun.get(key);
    if (!kept || (!isUsable(kept) && isUsable(r))) byRun.set(key, r);
  }

  const buckets = new Map();
  for (const r of byRun.values()) {
    const list = buckets.get(r.cellId) ?? [];
    list.push(r);
    buckets.set(r.cellId, list);
  }

  // [input, output] USD per million tokens; models not listed are priced at 0.
  const pricing = { "gemma4:31b": [0, 0], "gemma4:e4b-it": [0, 0], "nemotron-3-super:120b": [0, 0] };

  const out = [];
  for (const [cellId, bucket] of buckets) {
    const metrics = bucket.map((r) => r.metrics).filter((m) => m);
    if (metrics.length === 0) continue;
    const first = bucket[0];
    const passCount = bucket.filter((r) => !r.error && r.metrics?.verification?.passed === true).length;
    const passRate = passCount / bucket.length;
    const agg = (pick) => numericAgg(metrics.map(pick));
    const price = pricing[first.cell.provider.model] ?? [0, 0];
    const costMean =
      metrics.reduce((s, m) => s + (m.inputTokens * price[0] + m.outputTokens * price[1]) / 1_000_000, 0) /
      metrics.length;
    // Only the mean cost is computed here; the other fields repeat it.
    const costAgg = { count: metrics.length, mean: costMean, median: costMean, p95: costMean, stdev: 0, min: costMean, max: costMean };
    const totalTokens = agg((m) => m.totalTokens);
    out.push({
      cellId,
      cell: first.cell,
      runs: bucket.length,
      passRate,
      failureCategories: countCategories(bucket),
      totalTokens,
      inputTokens: agg((m) => m.inputTokens),
      outputTokens: agg((m) => m.outputTokens),
      maxContextTokens: agg((m) => m.maxContextTokens),
      turns: agg((m) => m.turns),
      toolCalls: agg((m) => m.toolCalls),
      specComplianceRate: agg((m) => m.specComplianceRate),
      llmTimeMs: agg((m) => m.llmTimeMs),
      totalTimeMs: agg((m) => m.totalTimeMs),
      transportBytes: agg((m) => m.transportBytesSent + m.transportBytesReceived),
      costUsd: costAgg,
      costPerSuccess: passCount > 0 ? (costMean * bucket.length) / passCount : Infinity,
      tokensPerSuccess: passCount > 0 ? (totalTokens.mean * bucket.length) / passCount : Infinity,
    });
  }
  return out;
}
/**
 * Escapes the five HTML-significant characters (ampersand, less-than,
 * greater-than, double quote, single quote) so the value can be placed in
 * element content or in a quoted attribute.
 *
 * Each character is replaced by its numeric character reference, built from
 * the character code, which browsers decode exactly like the named entity.
 *
 * @param s Any value; it is converted with String() first.
 * @returns The escaped string.
 */
function escapeHtml(s) {
  return String(s).replace(/[&<>"']/g, (c) => `${String.fromCharCode(38)}#${c.charCodeAt(0)};`);
}
+

SLOP benchmarks v2

+ pick a sweep to load +
+ +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ +
+
+
+ pass rate 100% + pass rate 50–99% + pass rate <50% + cells are clickable — per-run drill-down +
+
+ + + + + + + + + diff --git a/benchmarks/v2/dashboard/serve.ts b/benchmarks/v2/dashboard/serve.ts new file mode 100644 index 0000000..402dcf7 --- /dev/null +++ b/benchmarks/v2/dashboard/serve.ts @@ -0,0 +1,71 @@ +/** + * Tiny bun server for the benchmarks dashboard. Serves the static dashboard + * files from this directory and exposes /results//runs.jsonl plus a + * /sweeps endpoint that lists every sweep id with a runs.jsonl on disk. + * + * Run with: `bun run dashboard/serve.ts` (or `bun run dash` from v2/). + */ +import { existsSync, readFileSync, readdirSync, statSync } from "node:fs"; +import { join, resolve } from "node:path"; + +const DASH_DIR = new URL(".", import.meta.url).pathname; +const V2_ROOT = resolve(DASH_DIR, ".."); +const RESULTS_DIR = join(V2_ROOT, "results"); +const PORT = Number(process.env.DASH_PORT ?? 4180); + +const MIME: Record = { + html: "text/html; charset=utf-8", + js: "application/javascript; charset=utf-8", + css: "text/css; charset=utf-8", + json: "application/json; charset=utf-8", + jsonl: "application/x-ndjson; charset=utf-8", +}; + +function contentType(path: string): string { + const ext = path.split(".").pop() ?? ""; + return MIME[ext] ?? 
// HTTP entry point for the dashboard.
//   GET /sweeps          -> JSON array of sweep ids that have a runs.jsonl, oldest first
//   GET /results/<path>  -> a file below RESULTS_DIR
//   anything else        -> a static file below DASH_DIR ("/" maps to index.html)
Bun.serve({
  port: PORT,
  async fetch(req) {
    const url = new URL(req.url);
    let path: string;
    try {
      path = decodeURIComponent(url.pathname);
    } catch {
      // Malformed percent-encoding makes decodeURIComponent throw a URIError.
      return new Response("bad request", { status: 400 });
    }

    if (path === "/sweeps") {
      if (!existsSync(RESULTS_DIR)) return Response.json([]);
      // Stat each runs.jsonl once instead of on every comparator call.
      const sweeps = readdirSync(RESULTS_DIR)
        .map((name) => ({ name, runsPath: join(RESULTS_DIR, name, "runs.jsonl") }))
        .filter((entry) => existsSync(entry.runsPath))
        .map((entry) => ({ name: entry.name, mtimeMs: statSync(entry.runsPath).mtimeMs }))
        .sort((a, b) => a.mtimeMs - b.mtimeMs)
        .map((entry) => entry.name);
      return Response.json(sweeps);
    }

    if (path.startsWith("/results/")) {
      const rel = path.slice("/results/".length);
      const file = resolve(RESULTS_DIR, rel);
      // Compare against the directory WITH a trailing separator. A bare
      // startsWith(RESULTS_DIR) would also accept sibling directories whose
      // name merely begins with "results" (e.g. "../results-old/secret").
      const resultsPrefix = join(RESULTS_DIR, "/");
      if (file !== RESULTS_DIR && !file.startsWith(resultsPrefix)) {
        return new Response("forbidden", { status: 403 });
      }
      if (!existsSync(file) || !statSync(file).isFile()) return new Response("not found", { status: 404 });
      return new Response(readFileSync(file), { headers: { "Content-Type": contentType(file) } });
    }

    // Dashboard static files
    const localPath = path === "/" ? "/index.html" : path;
    const file = resolve(DASH_DIR, `.${localPath}`);
    if (!file.startsWith(DASH_DIR)) return new Response("forbidden", { status: 403 });
    if (existsSync(file) && statSync(file).isFile()) {
      return new Response(readFileSync(file), { headers: { "Content-Type": contentType(file) } });
    }
    return new Response("not found", { status: 404 });
  },
});
"pass%".padStart(6), + "tok̄".padStart(7), + "tok₉₅".padStart(7), + "ctxMx".padStart(6), + "turns̄".padStart(7), + "calls̄".padStart(7), + "spec%".padStart(6), + "t̄(s)".padStart(6), + "$/✓".padStart(8), + "tok/✓".padStart(9), + "label", + ]; + console.log(header.join(" ")); + for (const c of cells) { + const passPct = `${(c.passRate * 100).toFixed(0)}%`; + const specPct = `${(c.specComplianceRate.mean * 100).toFixed(0)}%`; + const label = cellLabel({ ...c.cell, iteration: 0 }).replace(` | iter=0`, ""); + console.log( + [ + c.cellId.slice(0, 16).padEnd(18), + String(c.runs).padStart(3), + passPct.padStart(6), + fmt(c.totalTokens, 0).padStart(7), + fmt95(c.totalTokens).padStart(7), + fmtNum(c.maxContextTokens.mean, 0).padStart(6), + fmt(c.turns, 1).padStart(7), + fmt(c.toolCalls, 1).padStart(7), + specPct.padStart(6), + fmtNum(c.totalTimeMs.mean / 1000, 1).padStart(6), + fmtCost(c.costPerSuccess).padStart(8), + fmtNum(c.tokensPerSuccess, 0).padStart(9), + label, + ].join(" "), + ); + } + const totals = { + runs: records.length, + pass: records.filter((r) => r.metrics?.verification?.passed).length, + }; + const passRate = totals.runs > 0 ? 
/**
 * Formats a USD amount for the summary table.
 *
 * Non-finite values print as "∞", exactly zero prints as "$0", values below
 * one cent get four decimals, and everything else gets three.
 */
function fmtCost(n: number): string {
  if (!Number.isFinite(n)) {
    return "∞";
  }
  if (n === 0) {
    return "$0";
  }
  const decimals = n < 0.01 ? 4 : 3;
  return "$" + n.toFixed(decimals);
}
/**
 * Estimates the USD cost of one run from its token counts, using the PRICING
 * entry for `provider.model`.
 *
 * A model with no PRICING entry yields 0, so an unlisted model is
 * indistinguishable from a free one in the result.
 */
export function estimateCostUsd(provider: ProviderConfig, inputTokens: number, outputTokens: number): number {
  const pricing = PRICING[provider.model];
  if (!pricing) {
    return 0;
  }
  const inputCost = (inputTokens / 1_000_000) * pricing.inputPerMillion;
  const outputCost = (outputTokens / 1_000_000) * pricing.outputPerMillion;
  return inputCost + outputCost;
}
/**
 * Collapses raw run records into one aggregate per cell.
 *
 * Steps:
 *  1. De-duplicate by `runId`. When two records share an id, a record with
 *     metrics and no error replaces one without; otherwise the first one seen
 *     is kept. This covers resume-after-fix, where the jsonl holds both the
 *     errored and the re-run record. Records without a `runId` are dropped.
 *  2. Group the survivors by `cellId` (records without one are dropped).
 *  3. Summarise each group. Numeric aggregates only see records that have
 *     metrics, while `runs` and `passRate` count every record in the group.
 */
export function aggregateCells(runs: RunRecord[]): CellAggregate[] {
  const isUsable = (r: RunRecord): boolean => !!r.metrics && !r.error;

  const keptByRunId = new Map<string, RunRecord>();
  for (const record of runs) {
    if (!record.runId) continue;
    const kept = keptByRunId.get(record.runId);
    if (!kept || (!isUsable(kept) && isUsable(record))) {
      keptByRunId.set(record.runId, record);
    }
  }

  const recordsByCell = new Map<string, RunRecord[]>();
  for (const record of keptByRunId.values()) {
    if (!record.cellId) continue;
    const group = recordsByCell.get(record.cellId);
    if (group) {
      group.push(record);
    } else {
      recordsByCell.set(record.cellId, [record]);
    }
  }

  const aggregates: CellAggregate[] = [];
  recordsByCell.forEach((cellRuns, cellId) => {
    const { cell } = cellRuns[0];
    const metrics = cellRuns.map((r) => r.metrics).filter((m): m is NonNullable<typeof m> => !!m);
    const runCount = cellRuns.length;
    const passCount = cellRuns.filter(isSuccess).length;

    const summarize = (pick: (m: (typeof metrics)[number]) => number): NumericAggregate =>
      numericAggregate(metrics.map(pick));
    // Scales a per-run mean up to "per successful run"; undefined without any success.
    const perSuccess = (mean: number): number =>
      passCount > 0 ? (mean * runCount) / passCount : Number.POSITIVE_INFINITY;

    const totalTokens = summarize((m) => m.totalTokens);
    const costUsd = summarize((m) => estimateCostUsd(cell.provider, m.inputTokens, m.outputTokens));

    aggregates.push({
      cellId,
      cell,
      runs: runCount,
      passRate: runCount > 0 ? passCount / runCount : 0,
      failureCategories: summarizeCategories(cellRuns),
      totalTokens,
      inputTokens: summarize((m) => m.inputTokens),
      outputTokens: summarize((m) => m.outputTokens),
      maxContextTokens: summarize((m) => m.maxContextTokens),
      turns: summarize((m) => m.turns),
      toolCalls: summarize((m) => m.toolCalls),
      specComplianceRate: summarize((m) => m.specComplianceRate),
      llmTimeMs: summarize((m) => m.llmTimeMs),
      totalTimeMs: summarize((m) => m.totalTimeMs),
      // Missing values become NaN, which numericAggregate filters out.
      timeToFirstToolCallMs: summarize((m) => m.timeToFirstToolCallMs ?? Number.NaN),
      transportBytes: summarize((m) => m.transportBytesSent + m.transportBytesReceived),
      costUsd,
      costPerSuccess: perSuccess(costUsd.mean),
      tokensPerSuccess: perSuccess(totalTokens.mean),
    });
  });
  return aggregates;
}
/**
 * Linear-interpolation percentile of an ascending-sorted array.
 *
 * `q` is a fraction (0.5 for the median, 0.95 for p95). The fractional rank
 * `q * (n - 1)` is interpolated between its two neighbouring elements.
 * Returns 0 for an empty array and the sole element for a one-element array.
 */
function percentile(sorted: number[], q: number): number {
  const n = sorted.length;
  if (n <= 1) {
    return n === 1 ? sorted[0] : 0;
  }
  const rank = q * (n - 1);
  const below = Math.floor(rank);
  const above = Math.ceil(rank);
  const base = sorted[below];
  if (below === above) {
    return base;
  }
  return base + (sorted[above] - base) * (rank - below);
}
/**
 * Welch's t-test on two independent samples.
 *
 * Returns the t statistic and an approximate two-sided p-value. The p-value
 * always uses a normal approximation rather than the t-distribution, so it is
 * too small for small samples. Good enough to colour dashboard rows; not good
 * enough to publish.
 *
 * @param a First sample.
 * @param b Second sample.
 * @returns `null` when either sample has fewer than two values. When both
 *   samples have zero variance: `{ t: 0, pTwoSided: 1 }` if their means are
 *   equal, otherwise `{ t: ±Infinity, pTwoSided: 0 }`, because two constant
 *   samples with different means do differ.
 */
export function welchTTest(a: number[], b: number[]): { t: number; pTwoSided: number } | null {
  const na = a.length;
  const nb = b.length;
  if (na < 2 || nb < 2) return null;
  const meanA = a.reduce((x, y) => x + y, 0) / na;
  const meanB = b.reduce((x, y) => x + y, 0) / nb;
  const varA = a.reduce((acc, v) => acc + (v - meanA) ** 2, 0) / (na - 1);
  const varB = b.reduce((acc, v) => acc + (v - meanB) ** 2, 0) / (nb - 1);
  const se = Math.sqrt(varA / na + varB / nb);
  if (se === 0) {
    // Both samples are constant (common for deterministic runs). The
    // statistic is 0/0 only when the means coincide; otherwise it diverges.
    if (meanA === meanB) return { t: 0, pTwoSided: 1 };
    return { t: meanA > meanB ? Number.POSITIVE_INFINITY : Number.NEGATIVE_INFINITY, pTwoSided: 0 };
  }
  const t = (meanA - meanB) / se;
  // Normal approximation to the two-sided p-value.
  const pTwoSided = 2 * (1 - phi(Math.abs(t)));
  return { t, pTwoSided };
}
/**
 * Assigns a run to every failure category that applies to it.
 *
 * - A record with `error` gets `cell_exception`.
 * - A record without metrics gets nothing else, so a record with neither
 *   `error` nor `metrics` yields an empty list.
 * - Tool-call counters and `finishReason === "max_turns"` each add their own
 *   category, then verification adds `no_verifier` or `verify_fail`.
 * - `ok` is added only when nothing else was, i.e. verification passed and
 *   no other category applies.
 *
 * @returns The categories in the order they were detected.
 */
export function categorizeRun(record: RunRecord): FailureCategory[] {
  const cats = new Set<FailureCategory>();
  if (record.error) cats.add("cell_exception");
  const m = record.metrics;
  if (!m) return Array.from(cats);

  if (m.unknownToolCalls > 0) cats.add("tool_unknown");
  if (m.invokeErrorCalls > 0) cats.add("tool_invoke_error");
  if (m.paramErrorCalls > 0) cats.add("tool_param_error");
  if (m.finishReason === "max_turns") cats.add("max_turns");

  if (!m.verification) {
    cats.add("no_verifier");
  } else if (!m.verification.passed) {
    cats.add("verify_fail");
  }

  // An empty set here means verification exists and passed (otherwise one of
  // the two branches above would have added a category), so one check suffices.
  if (cats.size === 0) cats.add("ok");
  return Array.from(cats);
}
/**
 * Count how many records fall into each failure category.
 *
 * A record contributes one count to every category categorizeRun assigns it,
 * so the totals can add up to more than `records.length`.
 *
 * @param records - run records to tally
 * @returns a counter for every category, including those with zero hits.
 */
export function summarizeCategories(records: RunRecord[]): Record<FailureCategory, number> {
  const counts: Record<FailureCategory, number> = {
    ok: 0,
    no_verifier: 0,
    verify_fail: 0,
    max_turns: 0,
    tool_unknown: 0,
    tool_invoke_error: 0,
    tool_param_error: 0,
    cell_exception: 0,
  };
  for (const record of records) {
    for (const cat of categorizeRun(record)) counts[cat] += 1;
  }
  return counts;
}

/**
 * No-op whose only purpose is to reference the CellMetrics type so that
 * import stays in use.
 */
export function _cellMetricsForTypeCheck(_m: CellMetrics) {
  // Intentionally empty.
}
/** Subset of the chat-completions response body that this adapter reads. */
interface OpenAIChatResponse {
  // Optional: an error payload delivered with a 2xx status may omit it.
  choices?: Array<{
    index: number;
    message: OpenAIMessage;
    finish_reason: string | null;
  }>;
  usage?: {
    prompt_tokens?: number;
    completion_tokens?: number;
    total_tokens?: number;
  };
}

/**
 * LlmProvider adapter for any server that exposes an OpenAI-style
 * `POST {baseUrl}/chat/completions` endpoint.
 */
export class OpenAICompatProvider implements LlmProvider {
  readonly id: string;
  readonly model: string;
  private readonly baseUrl: string;
  private readonly apiKey: string;
  private readonly timeoutMs: number;

  /**
   * @param opts - endpoint, model and optional overrides. `apiKey` defaults to
   *   a placeholder, `id` to `openai-compat:<model>`, and `requestTimeoutMs`
   *   to 180 000 ms.
   */
  constructor(opts: OpenAICompatOptions) {
    // Strip every trailing slash so the path join below never yields "//".
    this.baseUrl = opts.baseUrl.replace(/\/+$/, "");
    this.model = opts.model;
    this.apiKey = opts.apiKey ?? "dummy-key";
    this.id = opts.id ?? `openai-compat:${opts.model}`;
    this.timeoutMs = opts.requestTimeoutMs ?? 180_000;
  }

  /**
   * Send one chat-completions request and normalise the first choice.
   *
   * @param req - system prompt, conversation so far, tool definitions and sampling options
   * @returns the assistant message, token usage (0 where the server reports
   *   none), the normalised finish reason, and the time until `fetch` resolved.
   * @throws Error on a non-2xx status, on a response without choices, or when
   *   the request (including reading the body) exceeds the timeout.
   */
  async generate(req: GenerateRequest): Promise<GenerateResponse> {
    const messages: OpenAIMessage[] = [
      { role: "system", content: req.systemPrompt },
      ...req.messages.map(toOpenAIMessage),
    ];

    // Strict endpoints reject an empty `tools` array, so both tool fields are
    // left undefined (and therefore dropped by JSON.stringify) when unused.
    const hasTools = req.tools.length > 0;
    const body = {
      model: this.model,
      messages,
      tools: hasTools
        ? req.tools.map((t) => ({
            type: "function" as const,
            function: {
              name: t.name,
              description: t.description,
              parameters: t.parameters,
            },
          }))
        : undefined,
      tool_choice: hasTools ? "auto" : undefined,
      temperature: req.temperature ?? 0,
      max_tokens: req.maxTokens,
    };

    const t0 = performance.now();
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), this.timeoutMs);
    try {
      const res = await fetch(`${this.baseUrl}/chat/completions`, {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          Authorization: `Bearer ${this.apiKey}`,
        },
        body: JSON.stringify(body),
        signal: controller.signal,
      });
      // Measured when fetch resolves, before the body is consumed.
      const rawLatencyMs = performance.now() - t0;

      if (!res.ok) {
        const text = await res.text().catch(() => "");
        throw new Error(`OpenAI-compat ${this.baseUrl} ${res.status}: ${text.slice(0, 500)}`);
      }

      // The timer is still armed here so a stalled body read is bounded too.
      const json = (await res.json()) as OpenAIChatResponse;
      const choice = json.choices?.[0];
      if (!choice) throw new Error("OpenAI-compat response has no choices");

      const inputTokens = json.usage?.prompt_tokens ?? 0;
      const outputTokens = json.usage?.completion_tokens ?? 0;
      return {
        message: fromOpenAIMessage(choice.message),
        usage: {
          inputTokens,
          outputTokens,
          totalTokens: json.usage?.total_tokens ?? inputTokens + outputTokens,
        },
        finishReason: normaliseFinishReason(choice.finish_reason),
        rawLatencyMs,
      };
    } catch (err) {
      // Turn the abort into an error that names the cause.
      if (controller.signal.aborted) {
        throw new Error(`OpenAI-compat ${this.baseUrl} timed out after ${this.timeoutMs}ms`);
      }
      throw err;
    } finally {
      clearTimeout(timer);
    }
  }
}

/** Convert an internal chat message to the OpenAI wire shape. */
function toOpenAIMessage(m: ChatMessage): OpenAIMessage {
  if (m.role === "assistant" && m.toolCalls && m.toolCalls.length > 0) {
    return {
      role: "assistant",
      content: m.content,
      tool_calls: m.toolCalls.map((c) => ({
        id: c.id,
        type: "function",
        // The wire format carries arguments as a JSON-encoded string.
        function: { name: c.name, arguments: JSON.stringify(c.arguments) },
      })),
    };
  }
  if (m.role === "tool") {
    return {
      role: "tool",
      content: m.content,
      tool_call_id: m.toolCallId,
      name: m.name,
    };
  }
  return { role: m.role, content: m.content };
}

/** Convert an OpenAI wire message to the internal chat message shape. */
function fromOpenAIMessage(m: OpenAIMessage): ChatMessage {
  const toolCalls: ToolCall[] | undefined = m.tool_calls?.map((c) => ({
    id: c.id,
    name: c.function.name,
    arguments: parseArgs(c.function.arguments),
  }));
  return {
    role: (m.role as ChatMessage["role"]) ?? "assistant",
    content: m.content ?? "",
    toolCalls: toolCalls && toolCalls.length > 0 ? toolCalls : undefined,
  };
}

/**
 * Decode tool-call arguments into a plain record.
 *
 * - empty / missing input        → `{}`
 * - JSON object (or an object already decoded by the server) → returned as is
 * - any other JSON value, arrays included → `{ value: <parsed> }`
 * - text that is not valid JSON  → `{ _raw: <text> }` so the caller can still
 *   see what the model produced
 */
function parseArgs(raw: unknown): Record<string, unknown> {
  const toRecord = (value: unknown): Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value)
      ? (value as Record<string, unknown>)
      : { value };

  if (raw === undefined || raw === null || raw === "") return {};
  if (typeof raw !== "string") return toRecord(raw);
  try {
    return toRecord(JSON.parse(raw));
  } catch {
    return { _raw: raw };
  }
}

/**
 * Map a provider finish reason onto the internal vocabulary. Anything that is
 * not a tool-call or length signal (including `null`) is treated as "stop".
 */
function normaliseFinishReason(raw: string | null): FinishReason {
  switch (raw) {
    case "tool_calls":
    case "function_call":
      return "tool_calls";
    case "length":
    case "max_tokens":
      return "length";
    default:
      return "stop";
  }
}
+{"type":"sweep","config":{"id":"ablation-prompts-encodings","providers":[{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"}],"promptVariants":["minimal","spec","spec-terse"],"encodingVariants":["indented-text","json-compact","markdown-headings"],"optimizationVariants":["off"],"protocols":["slop","mcp"],"mcpVariants":["flat","flat+prompt"],"apps":["todo"],"dataScales":["s"],"scenarioFilter":["mark-all-done"],"seeds":[42],"iterations":1,"maxConcurrency":1,"maxTurns":30,"temperature":0},"startedAt":"2026-04-15T13:14:03.671Z"} +{"sweepId":"ablation-prompts-encodings","cellId":"6bff88133d655644","runId":"ablation-prompts-encodings:6bff88133d655644:0","configHash":"6bff88133d655644","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"minimal","encoding":"indented-text","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:14:03.672Z","durationMs":51585.324666,"metrics":{"turns":2,"toolCalls":6,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":2757,"outputTokens":442,"totalTokens":3199,"maxContextTokens":1456,"timeToFirstToolCallMs":33147.140999999996,"setupTimeMs":2.0802090000000106,"llmTimeMs":51566.906832999994,"totalTimeMs":51584.657750000006,"transportBytesSent":486,"transportBytesReceived":402,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":1301,"outputTokens":262,"latencyMs":33147.126874999994,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":1456,"outputTokens":180,"latencyMs":18419.779958,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}} 
+{"sweepId":"ablation-prompts-encodings","cellId":"0c2aa43697715c54","runId":"ablation-prompts-encodings:0c2aa43697715c54:0","configHash":"0c2aa43697715c54","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"minimal","encoding":"json-compact","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:14:55.258Z","durationMs":55335.918750000004,"metrics":{"turns":2,"toolCalls":6,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":5597,"outputTokens":516,"totalTokens":6113,"maxContextTokens":2876,"timeToFirstToolCallMs":37305.58974999999,"setupTimeMs":2.161500000001979,"llmTimeMs":55322.11345799999,"totalTimeMs":55335.858958,"transportBytesSent":486,"transportBytesReceived":402,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":2721,"outputTokens":341,"latencyMs":37305.579207999996,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":2876,"outputTokens":175,"latencyMs":18016.534249999997,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}} 
+{"sweepId":"ablation-prompts-encodings","cellId":"7de01e731db22914","runId":"ablation-prompts-encodings:7de01e731db22914:0","configHash":"7de01e731db22914","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"minimal","encoding":"markdown-headings","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:15:50.594Z","durationMs":137832.292292,"metrics":{"turns":4,"toolCalls":12,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":6,"inputTokens":8458,"outputTokens":1357,"totalTokens":9815,"maxContextTokens":2287,"timeToFirstToolCallMs":35589.190791000015,"setupTimeMs":2.3482090000034077,"llmTimeMs":137818.866334,"totalTimeMs":137832.177208,"transportBytesSent":486,"transportBytesReceived":402,"specComplianceRate":0.5,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":1868,"outputTokens":337,"latencyMs":35589.181291999994,"toolCalls":6,"toolCallKinds":["param_error","param_error","param_error","param_error","param_error","param_error"]},{"index":1,"inputTokens":2132,"outputTokens":622,"latencyMs":61173.34299999999,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":2,"inputTokens":2171,"outputTokens":392,"latencyMs":39279.158041999995,"toolCalls":5,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance"]},{"index":3,"inputTokens":2287,"outputTokens":6,"latencyMs":1777.1840000000084,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}} 
+{"sweepId":"ablation-prompts-encodings","cellId":"9dae0af9aee37f10","runId":"ablation-prompts-encodings:9dae0af9aee37f10:0","configHash":"9dae0af9aee37f10","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:18:08.427Z","durationMs":47738.54904099999,"metrics":{"turns":2,"toolCalls":6,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":4595,"outputTokens":448,"totalTokens":5043,"maxContextTokens":2375,"timeToFirstToolCallMs":30023.628457999963,"setupTimeMs":2.592833000002429,"llmTimeMs":47719.666708000004,"totalTimeMs":47738.48087500001,"transportBytesSent":486,"transportBytesReceived":402,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":2220,"outputTokens":275,"latencyMs":30023.612250000006,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":2375,"outputTokens":173,"latencyMs":17696.054458,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}} 
+{"sweepId":"ablation-prompts-encodings","cellId":"96e02e4e7de9a3b7","runId":"ablation-prompts-encodings:96e02e4e7de9a3b7:0","configHash":"96e02e4e7de9a3b7","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"json-compact","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:18:56.165Z","durationMs":56172.179000000004,"metrics":{"turns":2,"toolCalls":6,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":7435,"outputTokens":522,"totalTokens":7957,"maxContextTokens":3795,"timeToFirstToolCallMs":38336.83799999999,"setupTimeMs":1.1343749999650754,"llmTimeMs":56164.567332999955,"totalTimeMs":56172.16070800001,"transportBytesSent":486,"transportBytesReceived":402,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":3640,"outputTokens":349,"latencyMs":38336.81191599998,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":3795,"outputTokens":173,"latencyMs":17827.75541699998,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}} 
+{"sweepId":"ablation-prompts-encodings","cellId":"e0ef4d76686f08ea","runId":"ablation-prompts-encodings:e0ef4d76686f08ea:0","configHash":"e0ef4d76686f08ea","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"markdown-headings","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:42:39.360Z","durationMs":333329.89158299996,"metrics":{"turns":8,"toolCalls":16,"navigationToolCalls":2,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":8,"inputTokens":32583,"outputTokens":3029,"totalTokens":35612,"maxContextTokens":5844,"timeToFirstToolCallMs":42380.975875000004,"setupTimeMs":1.8915829999999971,"llmTimeMs":333310.26345999993,"totalTimeMs":333329.25654200005,"transportBytesSent":486,"transportBytesReceived":7378,"specComplianceRate":0.42857142857142855,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":2787,"outputTokens":333,"latencyMs":42380.956999999995,"toolCalls":6,"toolCallKinds":["param_error","param_error","param_error","param_error","param_error","param_error"]},{"index":1,"inputTokens":3051,"outputTokens":351,"latencyMs":35039.515416999995,"toolCalls":1,"toolCallKinds":["param_error"]},{"index":2,"inputTokens":3095,"outputTokens":449,"latencyMs":44300.983374999996,"toolCalls":1,"toolCallKinds":["slop_get_state"]},{"index":3,"inputTokens":4370,"outputTokens":218,"latencyMs":25731.645292,"toolCalls":1,"toolCallKinds":["param_error"]},{"index":4,"inputTokens":4414,"outputTokens":963,"latencyMs":104567.07620899999,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":5,"inputTokens":4453,"outputTokens":502,"latencyMs":53954.17258300001,"toolCalls":5,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance"]},{"index":6,"inputTokens":4569,"outputTokens":207,"latencyMs":24500.79916699999,"toolCalls":1,"toolCallKinds":["s
lop_get_state"]},{"index":7,"inputTokens":5844,"outputTokens":6,"latencyMs":2835.114416999975,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}} +{"sweepId":"ablation-prompts-encodings","cellId":"1254f7f4faea113f","runId":"ablation-prompts-encodings:1254f7f4faea113f:0","configHash":"1254f7f4faea113f","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec-terse","encoding":"indented-text","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:48:12.691Z","durationMs":45359.022791000025,"metrics":{"turns":2,"toolCalls":6,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":3071,"outputTokens":436,"totalTokens":3507,"maxContextTokens":1613,"timeToFirstToolCallMs":27565.84154200001,"setupTimeMs":2.5538329999544658,"llmTimeMs":45341.406999,"totalTimeMs":45358.954792000004,"transportBytesSent":486,"transportBytesReceived":402,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":1458,"outputTokens":262,"latencyMs":27565.833333000017,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":1613,"outputTokens":174,"latencyMs":17775.573665999982,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}} 
+{"sweepId":"ablation-prompts-encodings","cellId":"0d9f06f9023716ff","runId":"ablation-prompts-encodings:0d9f06f9023716ff:0","configHash":"0d9f06f9023716ff","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec-terse","encoding":"json-compact","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:48:58.051Z","durationMs":74536.82708299998,"metrics":{"turns":3,"toolCalls":7,"navigationToolCalls":1,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":11137,"outputTokens":655,"totalTokens":11792,"maxContextTokens":5226,"timeToFirstToolCallMs":35114.534208,"setupTimeMs":2.117917000025045,"llmTimeMs":74523.08412600006,"totalTimeMs":74535.90179199999,"transportBytesSent":486,"transportBytesReceived":8573,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":2878,"outputTokens":318,"latencyMs":35114.518209,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":3033,"outputTokens":169,"latencyMs":17403.150667000038,"toolCalls":1,"toolCallKinds":["slop_get_state"]},{"index":2,"inputTokens":5226,"outputTokens":168,"latencyMs":22005.41525000002,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}} 
+{"sweepId":"ablation-prompts-encodings","cellId":"2d4ee9f9bcedd11f","runId":"ablation-prompts-encodings:2d4ee9f9bcedd11f:0","configHash":"2d4ee9f9bcedd11f","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec-terse","encoding":"markdown-headings","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:50:12.590Z","durationMs":483956.34754199994,"metrics":{"turns":9,"toolCalls":17,"navigationToolCalls":2,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":9,"inputTokens":29054,"outputTokens":4669,"totalTokens":33723,"maxContextTokens":5121,"timeToFirstToolCallMs":37194.10391599999,"setupTimeMs":2.271749999956228,"llmTimeMs":483941.40420899994,"totalTimeMs":483956.22274999996,"transportBytesSent":486,"transportBytesReceived":7378,"specComplianceRate":0.4,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":2025,"outputTokens":352,"latencyMs":37194.09091700002,"toolCalls":6,"toolCallKinds":["param_error","param_error","param_error","param_error","param_error","param_error"]},{"index":1,"inputTokens":2289,"outputTokens":510,"latencyMs":50299.15070900001,"toolCalls":1,"toolCallKinds":["param_error"]},{"index":2,"inputTokens":2333,"outputTokens":515,"latencyMs":50488.169292000006,"toolCalls":1,"toolCallKinds":["param_error"]},{"index":3,"inputTokens":2372,"outputTokens":1082,"latencyMs":105892.05133399996,"toolCalls":1,"toolCallKinds":["slop_get_state"]},{"index":4,"inputTokens":3647,"outputTokens":854,"latencyMs":89824.584333,"toolCalls":1,"toolCallKinds":["param_error"]},{"index":5,"inputTokens":3691,"outputTokens":950,"latencyMs":101650.54083299998,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":6,"inputTokens":3730,"outputTokens":224,"latencyMs":25251.111874999944,"toolCalls":5,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance"
]},{"index":7,"inputTokens":3846,"outputTokens":176,"latencyMs":20583.621374999988,"toolCalls":1,"toolCallKinds":["slop_get_state"]},{"index":8,"inputTokens":5121,"outputTokens":6,"latencyMs":2758.083541000029,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}} +{"sweepId":"ablation-prompts-encodings","cellId":"5897a9956995fa0d","runId":"ablation-prompts-encodings:5897a9956995fa0d:0","configHash":"5897a9956995fa0d","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"n/a","encoding":"n/a","optimization":"n/a","protocol":"mcp","mcpVariant":"flat","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:58:16.547Z","durationMs":34546.89762499998,"metrics":{"turns":3,"toolCalls":7,"navigationToolCalls":0,"affordanceToolCalls":7,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":2611,"outputTokens":337,"totalTokens":2948,"maxContextTokens":1151,"timeToFirstToolCallMs":9433.217209000024,"setupTimeMs":86.30054099997506,"llmTimeMs":34428.401083000004,"totalTimeMs":34544.17337500001,"transportBytesSent":324,"transportBytesReceived":1102,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":549,"outputTokens":90,"latencyMs":9433.19070799998,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":1,"inputTokens":911,"outputTokens":241,"latencyMs":23952.858125000028,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":2,"inputTokens":1151,"outputTokens":6,"latencyMs":1042.3522499999963,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}} 
+{"sweepId":"ablation-prompts-encodings","cellId":"cc35c74e8c023f04","runId":"ablation-prompts-encodings:cc35c74e8c023f04:0","configHash":"cc35c74e8c023f04","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"n/a","encoding":"n/a","optimization":"n/a","protocol":"mcp","mcpVariant":"flat+prompt","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:58:51.094Z","durationMs":1.0311659999424592,"error":"todo: MCP variant \"flat+prompt\" not yet implemented\nError: todo: MCP variant \"flat+prompt\" not yet implemented\n at startMcpServer (/Users/carlid/dev/slop-slop-slop/benchmarks/v2/apps/todo/index.ts:46:17)\n at startMcpServer (/Users/carlid/dev/slop-slop-slop/benchmarks/v2/apps/todo/index.ts:44:24)\n at runMcpCell (/Users/carlid/dev/slop-slop-slop/benchmarks/v2/runner/mcp-cell.ts:41:28)\n at runMcpCell (/Users/carlid/dev/slop-slop-slop/benchmarks/v2/runner/mcp-cell.ts:12:33)\n at runSweep (/Users/carlid/dev/slop-slop-slop/benchmarks/v2/runner/sweep.ts:106:31)\n at processTicksAndRejections (native:7:39)"} 
+{"sweepId":"ablation-prompts-encodings","cellId":"cc35c74e8c023f04","runId":"ablation-prompts-encodings:cc35c74e8c023f04:0","configHash":"cc35c74e8c023f04","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"n/a","encoding":"n/a","optimization":"n/a","protocol":"mcp","mcpVariant":"flat+prompt","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T14:00:16.743Z","durationMs":31581.046041999998,"metrics":{"turns":3,"toolCalls":7,"navigationToolCalls":0,"affordanceToolCalls":7,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":3220,"outputTokens":300,"totalTokens":3520,"maxContextTokens":1354,"timeToFirstToolCallMs":6410.151417,"setupTimeMs":60.256874999999994,"llmTimeMs":31495.081041,"totalTimeMs":31580.590166,"transportBytesSent":324,"transportBytesReceived":1102,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":752,"outputTokens":53,"latencyMs":6410.142416,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":1,"inputTokens":1114,"outputTokens":241,"latencyMs":24035.955083,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":2,"inputTokens":1354,"outputTokens":6,"latencyMs":1048.9835420000018,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}} diff --git a/benchmarks/v2/results/smoke-crm/runs.jsonl b/benchmarks/v2/results/smoke-crm/runs.jsonl new file mode 100644 index 0000000..aeda59c --- /dev/null +++ b/benchmarks/v2/results/smoke-crm/runs.jsonl @@ -0,0 +1,3 @@ 
+{"type":"sweep","config":{"id":"smoke-crm","providers":[{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"}],"promptVariants":["spec"],"encodingVariants":["indented-text"],"optimizationVariants":["off"],"protocols":["slop","mcp"],"mcpVariants":["flat"],"apps":["crm"],"dataScales":["s"],"scenarioFilter":["high-value-alert"],"seeds":[42],"iterations":1,"maxConcurrency":1,"maxTurns":40,"temperature":0},"startedAt":"2026-04-15T12:36:01.444Z"} +{"sweepId":"smoke-crm","cellId":"ca3a9f06545dbc26","runId":"smoke-crm:ca3a9f06545dbc26:0","configHash":"ca3a9f06545dbc26","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"crm","scale":"s","scenario":"high-value-alert","seed":42,"iteration":0},"startedAt":"2026-04-15T12:36:01.445Z","durationMs":65072.22833300001,"metrics":{"turns":2,"toolCalls":6,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":8074,"outputTokens":549,"totalTokens":8623,"maxContextTokens":4217,"timeToFirstToolCallMs":62162.816999999995,"setupTimeMs":2.2873750000000115,"llmTimeMs":65049.417958,"totalTimeMs":65071.560084,"transportBytesSent":828,"transportBytesReceived":402,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":3857,"outputTokens":543,"latencyMs":62162.801583,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":4217,"outputTokens":6,"latencyMs":2886.616374999998,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":6,"passedChecks":6,"failures":[]}}} 
+{"sweepId":"smoke-crm","cellId":"e43d5960201b9cf1","runId":"smoke-crm:e43d5960201b9cf1:0","configHash":"e43d5960201b9cf1","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"n/a","encoding":"n/a","optimization":"n/a","protocol":"mcp","mcpVariant":"flat","app":"crm","scale":"s","scenario":"high-value-alert","seed":42,"iteration":0},"startedAt":"2026-04-15T12:37:06.519Z","durationMs":141165.18675,"metrics":{"turns":4,"toolCalls":13,"navigationToolCalls":0,"affordanceToolCalls":7,"unknownToolCalls":0,"invokeErrorCalls":6,"paramErrorCalls":0,"inputTokens":5579,"outputTokens":1420,"totalTokens":6999,"maxContextTokens":2017,"timeToFirstToolCallMs":11841.185708999998,"setupTimeMs":81.94433299999946,"llmTimeMs":141050.49129200002,"totalTimeMs":141164.1875,"transportBytesSent":1272,"transportBytesReceived":1732,"specComplianceRate":0.5384615384615384,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":872,"outputTokens":110,"latencyMs":11841.171208999993,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":1,"inputTokens":1183,"outputTokens":677,"latencyMs":66173.42120800001,"toolCalls":6,"toolCallKinds":["invoke_error","invoke_error","invoke_error","invoke_error","invoke_error","invoke_error"]},{"index":2,"inputTokens":1507,"outputTokens":627,"latencyMs":61510.479500000016,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":3,"inputTokens":2017,"outputTokens":6,"latencyMs":1525.4193749999977,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":6,"passedChecks":6,"failures":[]}}} diff --git a/benchmarks/v2/results/smoke-file-browser/runs.jsonl b/benchmarks/v2/results/smoke-file-browser/runs.jsonl new file mode 100644 index 0000000..1c673dc --- /dev/null +++ b/benchmarks/v2/results/smoke-file-browser/runs.jsonl @@ -0,0 +1,3 @@ 
+{"type":"sweep","config":{"id":"smoke-file-browser","providers":[{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"}],"promptVariants":["spec"],"encodingVariants":["indented-text"],"optimizationVariants":["off"],"protocols":["slop","mcp"],"mcpVariants":["flat"],"apps":["file-browser"],"dataScales":["s"],"scenarioFilter":["delete-empty-dirs"],"seeds":[42],"iterations":1,"maxConcurrency":1,"maxTurns":30,"temperature":0},"startedAt":"2026-04-15T12:42:04.925Z"} +{"sweepId":"smoke-file-browser","cellId":"bea6d0f1f4d3dde4","runId":"smoke-file-browser:bea6d0f1f4d3dde4:0","configHash":"bea6d0f1f4d3dde4","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"file-browser","scale":"s","scenario":"delete-empty-dirs","seed":42,"iteration":0},"startedAt":"2026-04-15T12:42:04.926Z","durationMs":214138.625,"metrics":{"turns":3,"toolCalls":2,"navigationToolCalls":1,"affordanceToolCalls":1,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":10442,"outputTokens":2000,"totalTokens":12442,"maxContextTokens":4451,"timeToFirstToolCallMs":44267.387458000005,"setupTimeMs":2.2589169999999967,"llmTimeMs":214127.14291700002,"totalTimeMs":214137.983083,"transportBytesSent":52,"transportBytesReceived":3741,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":2982,"outputTokens":405,"latencyMs":44267.379417000004,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":1,"inputTokens":3009,"outputTokens":374,"latencyMs":36754.872083,"toolCalls":1,"toolCallKinds":["slop_get_state"]},{"index":2,"inputTokens":4451,"outputTokens":1221,"latencyMs":133104.891417,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}} 
+{"sweepId":"smoke-file-browser","cellId":"566bedc1676bb5d4","runId":"smoke-file-browser:566bedc1676bb5d4:0","configHash":"566bedc1676bb5d4","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"n/a","encoding":"n/a","optimization":"n/a","protocol":"mcp","mcpVariant":"flat","app":"file-browser","scale":"s","scenario":"delete-empty-dirs","seed":42,"iteration":0},"startedAt":"2026-04-15T12:45:39.065Z","durationMs":129256.30066599997,"metrics":{"turns":5,"toolCalls":4,"navigationToolCalls":0,"affordanceToolCalls":4,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":4236,"outputTokens":1317,"totalTokens":5553,"maxContextTokens":1163,"timeToFirstToolCallMs":16461.97116699998,"setupTimeMs":69.93870899998001,"llmTimeMs":129165.43254200005,"totalTimeMs":129255.48000000001,"transportBytesSent":168,"transportBytesReceived":1348,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":548,"outputTokens":164,"latencyMs":16461.958165999997,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":1,"inputTokens":757,"outputTokens":212,"latencyMs":20751.344209000003,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":2,"inputTokens":791,"outputTokens":279,"latencyMs":27032.005584000028,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":3,"inputTokens":977,"outputTokens":331,"latencyMs":32366.21983300004,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":4,"inputTokens":1163,"outputTokens":331,"latencyMs":32553.904749999987,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}} diff --git a/benchmarks/v2/results/smoke-mcp/runs.jsonl b/benchmarks/v2/results/smoke-mcp/runs.jsonl new file mode 100644 index 0000000..06a6b41 --- /dev/null +++ b/benchmarks/v2/results/smoke-mcp/runs.jsonl @@ -0,0 +1,3 @@ 
+{"type":"sweep","config":{"id":"smoke-mcp","providers":[{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"}],"promptVariants":["spec"],"encodingVariants":["indented-text"],"optimizationVariants":["off"],"protocols":["slop","mcp"],"mcpVariants":["flat"],"apps":["issue-tracker"],"dataScales":["s"],"scenarioFilter":["explore-and-act"],"seeds":[42],"iterations":1,"maxConcurrency":1,"maxTurns":20,"temperature":0},"startedAt":"2026-04-15T12:12:19.983Z"} +{"sweepId":"smoke-mcp","cellId":"785d02dabebe52d5","runId":"smoke-mcp:785d02dabebe52d5:0","configHash":"785d02dabebe52d5","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":0},"startedAt":"2026-04-15T12:12:19.984Z","durationMs":60390.156749999995,"metrics":{"turns":2,"toolCalls":2,"navigationToolCalls":0,"affordanceToolCalls":2,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":9603,"outputTokens":432,"totalTokens":10035,"maxContextTokens":4895,"timeToFirstToolCallMs":42213.346625,"setupTimeMs":2.596624999999996,"llmTimeMs":60372.30433299999,"totalTimeMs":60389.474375,"transportBytesSent":375,"transportBytesReceived":350,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":4708,"outputTokens":271,"latencyMs":42213.33775,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":1,"inputTokens":4895,"outputTokens":161,"latencyMs":18158.966582999994,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}} 
+{"sweepId":"smoke-mcp","cellId":"eef232b363ac08b8","runId":"smoke-mcp:eef232b363ac08b8:0","configHash":"eef232b363ac08b8","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"n/a","encoding":"n/a","optimization":"n/a","protocol":"mcp","mcpVariant":"flat","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":0},"startedAt":"2026-04-15T12:13:20.375Z","durationMs":33718.141541000005,"metrics":{"turns":4,"toolCalls":4,"navigationToolCalls":0,"affordanceToolCalls":4,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":5074,"outputTokens":315,"totalTokens":5389,"maxContextTokens":1590,"timeToFirstToolCallMs":10728.555625,"setupTimeMs":80.23566600000049,"llmTimeMs":33611.61062399999,"totalTimeMs":33717.270083,"transportBytesSent":368,"transportBytesReceived":1541,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":997,"outputTokens":95,"latencyMs":10728.548500000004,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":1,"inputTokens":1078,"outputTokens":64,"latencyMs":6545.733207999991,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":2,"inputTokens":1409,"outputTokens":150,"latencyMs":15321.989333000005,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":3,"inputTokens":1590,"outputTokens":6,"latencyMs":1015.3395829999936,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}} diff --git a/benchmarks/v2/results/smoke-todo/runs.jsonl b/benchmarks/v2/results/smoke-todo/runs.jsonl new file mode 100644 index 0000000..16ddbfd --- /dev/null +++ b/benchmarks/v2/results/smoke-todo/runs.jsonl @@ -0,0 +1,3 @@ 
+{"type":"sweep","config":{"id":"smoke-todo","providers":[{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"}],"promptVariants":["spec"],"encodingVariants":["indented-text"],"optimizationVariants":["off"],"protocols":["slop","mcp"],"mcpVariants":["flat"],"apps":["todo"],"dataScales":["s"],"scenarioFilter":["mark-all-done"],"seeds":[42],"iterations":1,"maxConcurrency":1,"maxTurns":30,"temperature":0},"startedAt":"2026-04-15T12:17:21.029Z"} +{"sweepId":"smoke-todo","cellId":"9dae0af9aee37f10","runId":"smoke-todo:9dae0af9aee37f10:0","configHash":"9dae0af9aee37f10","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T12:17:21.030Z","durationMs":48185.751083,"metrics":{"turns":2,"toolCalls":6,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":4595,"outputTokens":448,"totalTokens":5043,"maxContextTokens":2375,"timeToFirstToolCallMs":30390.164292,"setupTimeMs":2.1136669999999924,"llmTimeMs":48174.050042999996,"totalTimeMs":48185.067708,"transportBytesSent":486,"transportBytesReceived":402,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":2220,"outputTokens":275,"latencyMs":30390.157584,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":2375,"outputTokens":173,"latencyMs":17783.892459,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}} 
+{"sweepId":"smoke-todo","cellId":"5897a9956995fa0d","runId":"smoke-todo:5897a9956995fa0d:0","configHash":"5897a9956995fa0d","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"n/a","encoding":"n/a","optimization":"n/a","protocol":"mcp","mcpVariant":"flat","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T12:18:09.216Z","durationMs":34703.892374999996,"metrics":{"turns":3,"toolCalls":7,"navigationToolCalls":0,"affordanceToolCalls":7,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":2611,"outputTokens":337,"totalTokens":2948,"maxContextTokens":1151,"timeToFirstToolCallMs":9442.800042000003,"setupTimeMs":81.44879199999559,"llmTimeMs":34596.018457999984,"totalTimeMs":34702.949875,"transportBytesSent":324,"transportBytesReceived":1102,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":549,"outputTokens":90,"latencyMs":9442.779665999995,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":1,"inputTokens":911,"outputTokens":241,"latencyMs":24044.363457999993,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":2,"inputTokens":1151,"outputTokens":6,"latencyMs":1108.8753339999967,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}} diff --git a/benchmarks/v2/results/smoke/aggregated.json b/benchmarks/v2/results/smoke/aggregated.json new file mode 100644 index 0000000..617b061 --- /dev/null +++ b/benchmarks/v2/results/smoke/aggregated.json @@ -0,0 +1,288 @@ +{ + "source": "results/smoke/runs.jsonl", + "runs": 6, + "cells": [ + { + "cellId": "785d02dabebe52d5", + "cell": { + "provider": { + "kind": "openai-compat", + "baseUrl": "http://slopinator-s-1.local:11434/v1", + "model": "gemma4:31b" + }, + "prompt": "spec", + "encoding": "indented-text", + "optimization": 
"off", + "protocol": "slop", + "app": "issue-tracker", + "scale": "s", + "scenario": "explore-and-act", + "seed": 42, + "iteration": 0 + }, + "runs": 3, + "passRate": 1, + "failureCategories": { + "ok": 3, + "no_verifier": 0, + "verify_fail": 0, + "max_turns": 0, + "tool_unknown": 0, + "tool_invoke_error": 0, + "tool_param_error": 0, + "cell_exception": 0 + }, + "totalTokens": { + "count": 3, + "mean": 10101.666666666666, + "median": 10134, + "p95": 10135.8, + "stdev": 57.74368652357877, + "min": 10035, + "max": 10136 + }, + "inputTokens": { + "count": 3, + "mean": 9603, + "median": 9603, + "p95": 9603, + "stdev": 0, + "min": 9603, + "max": 9603 + }, + "outputTokens": { + "count": 3, + "mean": 498.6666666666667, + "median": 531, + "p95": 532.8, + "stdev": 57.74368652357878, + "min": 432, + "max": 533 + }, + "maxContextTokens": { + "count": 3, + "mean": 4895, + "median": 4895, + "p95": 4895, + "stdev": 0, + "min": 4895, + "max": 4895 + }, + "turns": { + "count": 3, + "mean": 2, + "median": 2, + "p95": 2, + "stdev": 0, + "min": 2, + "max": 2 + }, + "toolCalls": { + "count": 3, + "mean": 2, + "median": 2, + "p95": 2, + "stdev": 0, + "min": 2, + "max": 2 + }, + "specComplianceRate": { + "count": 3, + "mean": 1, + "median": 1, + "p95": 1, + "stdev": 0, + "min": 1, + "max": 1 + }, + "llmTimeMs": { + "count": 3, + "mean": 59255.96312499999, + "median": 58775.35249999999, + "p95": 60187.3716125, + "stdev": 944.6343587235024, + "min": 58648.274249999995, + "max": 60344.262625 + }, + "totalTimeMs": { + "count": 3, + "mean": 59271.172083666665, + "median": 58788.625958000004, + "p95": 60204.2368589, + "stdev": 946.3498279522337, + "min": 58663.363333999994, + "max": 60361.526959 + }, + "timeToFirstToolCallMs": { + "count": 3, + "mean": 41278.56054200001, + "median": 40849.855666999996, + "p95": 42089.443066700005, + "stdev": 822.7889262352226, + "min": 40758.65095900002, + "max": 42227.175 + }, + "transportBytes": { + "count": 3, + "mean": 725, + "median": 725, + "p95": 725, 
+ "stdev": 0, + "min": 725, + "max": 725 + }, + "costUsd": { + "count": 3, + "mean": 0, + "median": 0, + "p95": 0, + "stdev": 0, + "min": 0, + "max": 0 + }, + "costPerSuccess": 0, + "tokensPerSuccess": 10101.666666666666 + }, + { + "cellId": "fbc384d4e3ccd4ae", + "cell": { + "provider": { + "kind": "openai-compat", + "baseUrl": "http://slopinator-s-1.local:11434/v1", + "model": "gemma4:31b" + }, + "prompt": "spec", + "encoding": "indented-text", + "optimization": "combined", + "protocol": "slop", + "app": "issue-tracker", + "scale": "s", + "scenario": "explore-and-act", + "seed": 42, + "iteration": 0 + }, + "runs": 3, + "passRate": 1, + "failureCategories": { + "ok": 3, + "no_verifier": 0, + "verify_fail": 0, + "max_turns": 0, + "tool_unknown": 0, + "tool_invoke_error": 0, + "tool_param_error": 0, + "cell_exception": 0 + }, + "totalTokens": { + "count": 3, + "mean": 8612, + "median": 8612, + "p95": 8612, + "stdev": 0, + "min": 8612, + "max": 8612 + }, + "inputTokens": { + "count": 3, + "mean": 8115, + "median": 8115, + "p95": 8115, + "stdev": 0, + "min": 8115, + "max": 8115 + }, + "outputTokens": { + "count": 3, + "mean": 497, + "median": 497, + "p95": 497, + "stdev": 0, + "min": 497, + "max": 497 + }, + "maxContextTokens": { + "count": 3, + "mean": 4151, + "median": 4151, + "p95": 4151, + "stdev": 0, + "min": 4151, + "max": 4151 + }, + "turns": { + "count": 3, + "mean": 2, + "median": 2, + "p95": 2, + "stdev": 0, + "min": 2, + "max": 2 + }, + "toolCalls": { + "count": 3, + "mean": 2, + "median": 2, + "p95": 2, + "stdev": 0, + "min": 2, + "max": 2 + }, + "specComplianceRate": { + "count": 3, + "mean": 1, + "median": 1, + "p95": 1, + "stdev": 0, + "min": 1, + "max": 1 + }, + "llmTimeMs": { + "count": 3, + "mean": 55551.42805566667, + "median": 54098.86620800002, + "p95": 58073.06235890001, + "stdev": 2566.380920587207, + "min": 54040.778249999974, + "max": 58514.63970900001 + }, + "totalTimeMs": { + "count": 3, + "mean": 55564.67487466668, + "median": 54111.153208, 
+ "p95": 58087.66363330001, + "stdev": 2567.7747776995475, + "min": 54053.373291000025, + "max": 58529.49812500001 + }, + "timeToFirstToolCallMs": { + "count": 3, + "mean": 32601.769805333333, + "median": 31149.366374999983, + "p95": 35087.656049400015, + "stdev": 2531.8199430530817, + "min": 31130.699250000005, + "max": 35525.243791000015 + }, + "transportBytes": { + "count": 3, + "mean": 725, + "median": 725, + "p95": 725, + "stdev": 0, + "min": 725, + "max": 725 + }, + "costUsd": { + "count": 3, + "mean": 0, + "median": 0, + "p95": 0, + "stdev": 0, + "min": 0, + "max": 0 + }, + "costPerSuccess": 0, + "tokensPerSuccess": 8612 + } + ] +} \ No newline at end of file diff --git a/benchmarks/v2/results/smoke/runs.jsonl b/benchmarks/v2/results/smoke/runs.jsonl new file mode 100644 index 0000000..6b2ea4b --- /dev/null +++ b/benchmarks/v2/results/smoke/runs.jsonl @@ -0,0 +1,7 @@ +{"type":"sweep","config":{"id":"smoke","providers":[{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"}],"promptVariants":["spec"],"encodingVariants":["indented-text"],"optimizationVariants":["off","combined"],"protocols":["slop"],"apps":["issue-tracker"],"dataScales":["s"],"scenarioFilter":["explore-and-act"],"seeds":[42],"iterations":3,"maxConcurrency":1,"maxTurns":20,"temperature":0},"startedAt":"2026-04-15T11:55:46.162Z"} 
+{"sweepId":"smoke","cellId":"785d02dabebe52d5","runId":"smoke:785d02dabebe52d5:0","configHash":"785d02dabebe52d5","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":0},"startedAt":"2026-04-15T11:55:46.162Z","durationMs":60362.468917,"metrics":{"turns":2,"toolCalls":2,"navigationToolCalls":0,"affordanceToolCalls":2,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":9603,"outputTokens":432,"totalTokens":10035,"maxContextTokens":4895,"timeToFirstToolCallMs":42227.175,"setupTimeMs":3.0109580000000022,"llmTimeMs":60344.262625,"totalTimeMs":60361.526959,"transportBytesSent":375,"transportBytesReceived":350,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":4708,"outputTokens":271,"latencyMs":42227.166667000005,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":1,"inputTokens":4895,"outputTokens":161,"latencyMs":18117.095957999998,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}} 
+{"sweepId":"smoke","cellId":"785d02dabebe52d5","runId":"smoke:785d02dabebe52d5:1","configHash":"785d02dabebe52d5","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":1},"startedAt":"2026-04-15T11:56:46.525Z","durationMs":58663.499667000004,"metrics":{"turns":2,"toolCalls":2,"navigationToolCalls":0,"affordanceToolCalls":2,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":9603,"outputTokens":531,"totalTokens":10134,"maxContextTokens":4895,"timeToFirstToolCallMs":40849.855666999996,"setupTimeMs":2.284749999998894,"llmTimeMs":58648.274249999995,"totalTimeMs":58663.363333999994,"transportBytesSent":375,"transportBytesReceived":350,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":4708,"outputTokens":372,"latencyMs":40849.844791999996,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":1,"inputTokens":4895,"outputTokens":159,"latencyMs":17798.429458,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}} 
+{"sweepId":"smoke","cellId":"785d02dabebe52d5","runId":"smoke:785d02dabebe52d5:2","configHash":"785d02dabebe52d5","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":2},"startedAt":"2026-04-15T11:57:45.189Z","durationMs":58788.669958,"metrics":{"turns":2,"toolCalls":2,"navigationToolCalls":0,"affordanceToolCalls":2,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":9603,"outputTokens":533,"totalTokens":10136,"maxContextTokens":4895,"timeToFirstToolCallMs":40758.65095900002,"setupTimeMs":2.3049170000012964,"llmTimeMs":58775.35249999999,"totalTimeMs":58788.625958000004,"transportBytesSent":375,"transportBytesReceived":350,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":4708,"outputTokens":372,"latencyMs":40758.644125000006,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":1,"inputTokens":4895,"outputTokens":161,"latencyMs":18016.708374999987,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}} 
+{"sweepId":"smoke","cellId":"fbc384d4e3ccd4ae","runId":"smoke:fbc384d4e3ccd4ae:0","configHash":"fbc384d4e3ccd4ae","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"combined","protocol":"slop","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":0},"startedAt":"2026-04-15T11:58:43.978Z","durationMs":58529.54658299999,"metrics":{"turns":2,"toolCalls":2,"navigationToolCalls":0,"affordanceToolCalls":2,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":8115,"outputTokens":497,"totalTokens":8612,"maxContextTokens":4151,"timeToFirstToolCallMs":35525.243791000015,"setupTimeMs":1.9764159999904223,"llmTimeMs":58514.63970900001,"totalTimeMs":58529.49812500001,"transportBytesSent":375,"transportBytesReceived":350,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":3964,"outputTokens":288,"latencyMs":35525.239417000004,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":1,"inputTokens":4151,"outputTokens":209,"latencyMs":22989.400292000006,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}} 
+{"sweepId":"smoke","cellId":"fbc384d4e3ccd4ae","runId":"smoke:fbc384d4e3ccd4ae:1","configHash":"fbc384d4e3ccd4ae","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"combined","protocol":"slop","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":1},"startedAt":"2026-04-15T11:59:42.508Z","durationMs":54111.23016699997,"metrics":{"turns":2,"toolCalls":2,"navigationToolCalls":0,"affordanceToolCalls":2,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":8115,"outputTokens":497,"totalTokens":8612,"maxContextTokens":4151,"timeToFirstToolCallMs":31149.366374999983,"setupTimeMs":2.1295000000100117,"llmTimeMs":54098.86620800002,"totalTimeMs":54111.153208,"transportBytesSent":375,"transportBytesReceived":350,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":3964,"outputTokens":288,"latencyMs":31149.329875000025,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":1,"inputTokens":4151,"outputTokens":209,"latencyMs":22949.536332999996,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}} 
+{"sweepId":"smoke","cellId":"fbc384d4e3ccd4ae","runId":"smoke:fbc384d4e3ccd4ae:2","configHash":"fbc384d4e3ccd4ae","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"combined","protocol":"slop","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":2},"startedAt":"2026-04-15T12:00:36.619Z","durationMs":54053.455665999965,"metrics":{"turns":2,"toolCalls":2,"navigationToolCalls":0,"affordanceToolCalls":2,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":8115,"outputTokens":497,"totalTokens":8612,"maxContextTokens":4151,"timeToFirstToolCallMs":31130.699250000005,"setupTimeMs":2.4627910000272095,"llmTimeMs":54040.778249999974,"totalTimeMs":54053.373291000025,"transportBytesSent":375,"transportBytesReceived":350,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":3964,"outputTokens":288,"latencyMs":31130.65529199998,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":1,"inputTokens":4151,"outputTokens":209,"latencyMs":22910.122957999993,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}} diff --git a/benchmarks/v2/run.ts b/benchmarks/v2/run.ts new file mode 100644 index 0000000..2653509 --- /dev/null +++ b/benchmarks/v2/run.ts @@ -0,0 +1,38 @@ +import { parseArgs } from "node:util"; +import { runSweep } from "./runner/sweep.ts"; +import type { SweepConfig } from "./runner/types.ts"; + +const { values } = parseArgs({ + options: { + config: { type: "string", default: "smoke" }, + "dry-run": { type: "boolean", default: false }, + fresh: { type: "boolean", default: false }, + id: { type: "string" }, + }, +}); + +const configName = values.config!; +const mod = await import(`./config/${configName}.ts`); +const camel = configName.replace(/-([a-z])/g, (_, c: string) => c.toUpperCase()); 
+const sweepName = `${camel}Sweep`; +const sweep: SweepConfig | undefined = mod[sweepName] ?? mod.default; +if (!sweep) { + console.error(`config/${configName}.ts must export \`${sweepName}\` or a default SweepConfig`); + process.exit(1); +} + +if (values.id) sweep.id = values.id; + +console.log(`[run] sweep=${sweep.id} config=${configName}`); +console.log( + `[run] providers=${sweep.providers.map((p) => `${p.kind}:${p.model}`).join(",")} ` + + `prompts=${sweep.promptVariants.join(",")} ` + + `encodings=${sweep.encodingVariants.join(",")} ` + + `optimizations=${sweep.optimizationVariants.join(",")} ` + + `protocols=${sweep.protocols.join(",")} ` + + `apps=${sweep.apps.join(",")} ` + + `scales=${sweep.dataScales.join(",")} ` + + `iterations=${sweep.iterations}`, +); + +await runSweep(sweep, { dryRun: values["dry-run"], fresh: values.fresh }); diff --git a/benchmarks/v2/runner/hash.ts b/benchmarks/v2/runner/hash.ts new file mode 100644 index 0000000..3d3f49c --- /dev/null +++ b/benchmarks/v2/runner/hash.ts @@ -0,0 +1,66 @@ +import { createHash } from "node:crypto"; +import type { Cell, SweepConfig } from "./types.ts"; + +/** + * Canonicalize a value into a deterministic JSON string: object keys sorted, + * arrays preserved in order, primitives as-is. Two cells that should hash to + * the same value must stringify identically. + */ +export function canonicalize(value: unknown): string { + if (value === null || value === undefined) return "null"; + if (typeof value !== "object") return JSON.stringify(value); + if (Array.isArray(value)) { + return `[${value.map(canonicalize).join(",")}]`; + } + const obj = value as Record; + const keys = Object.keys(obj).sort(); + return `{${keys.map((k) => `${JSON.stringify(k)}:${canonicalize(obj[k])}`).join(",")}}`; +} + +function sha256Hex(s: string): string { + return createHash("sha256").update(s).digest("hex"); +} + +/** + * Deterministic ID for a cell, independent of iteration index. 
Two cells with + * the same configHash should produce identical runs (given a fixed seed). + */ +export function configHash(sweep: SweepConfig, cell: Cell): string { + const snapshot = { + sweep: { + maxTurns: sweep.maxTurns, + temperature: sweep.temperature, + }, + cell: { + provider: cell.provider, + prompt: cell.prompt, + encoding: cell.encoding, + optimization: cell.optimization, + protocol: cell.protocol, + mcpVariant: cell.mcpVariant ?? null, + app: cell.app, + scale: cell.scale, + scenario: cell.scenario, + seed: cell.seed, + }, + }; + return sha256Hex(canonicalize(snapshot)).slice(0, 16); +} + +export function cellLabel(cell: Cell): string { + const parts = [ + cell.app, + cell.scale, + cell.scenario, + cell.protocol, + cell.protocol === "mcp" ? (cell.mcpVariant ?? "flat") : `${cell.prompt}/${cell.encoding}/${cell.optimization}`, + `${cell.provider.kind}:${cell.provider.model}`, + `seed=${cell.seed}`, + `iter=${cell.iteration}`, + ]; + return parts.join(" | "); +} + +export function runId(sweepId: string, cell: Cell, cfgHash: string): string { + return `${sweepId}:${cfgHash}:${cell.iteration}`; +} diff --git a/benchmarks/v2/runner/jsonl.ts b/benchmarks/v2/runner/jsonl.ts new file mode 100644 index 0000000..6ff7626 --- /dev/null +++ b/benchmarks/v2/runner/jsonl.ts @@ -0,0 +1,32 @@ +import { mkdirSync, createWriteStream, type WriteStream } from "node:fs"; +import { dirname } from "node:path"; + +export interface JsonlWriterOpts { + append?: boolean; +} + +export class JsonlWriter { + private stream: WriteStream | null = null; + + constructor(private readonly path: string, private readonly opts: JsonlWriterOpts = {}) {} + + open() { + mkdirSync(dirname(this.path), { recursive: true }); + // Default: truncate on open so re-running a sweep with the same id starts + // fresh. Pass {append: true} to accumulate across runs. + this.stream = createWriteStream(this.path, { flags: this.opts.append ? 
"a" : "w" }); + } + + write(record: unknown) { + if (!this.stream) throw new Error("JsonlWriter not opened"); + this.stream.write(`${JSON.stringify(record)}\n`); + } + + async close() { + if (!this.stream) return; + await new Promise((resolve, reject) => { + this.stream!.end((err?: Error | null) => (err ? reject(err) : resolve())); + }); + this.stream = null; + } +} diff --git a/benchmarks/v2/runner/mcp-cell.ts b/benchmarks/v2/runner/mcp-cell.ts new file mode 100644 index 0000000..269de39 --- /dev/null +++ b/benchmarks/v2/runner/mcp-cell.ts @@ -0,0 +1,196 @@ +import { resolveApp } from "../apps/registry.ts"; +import { resolveMcpVariant } from "../variants/mcp-variants.ts"; +import type { LlmProvider, ChatMessage, ToolDef } from "../providers/types.ts"; +import type { Cell, CellMetrics, SweepConfig, TurnMetric } from "./types.ts"; + +interface RunMcpCellArgs { + cell: Cell; + sweep: SweepConfig; + provider: LlmProvider; +} + +export async function runMcpCell({ cell, sweep, provider }: RunMcpCellArgs): Promise { + const app = resolveApp(cell.app); + if (!app.startMcpServer || !app.mcpSystemPrompt) { + throw new Error(`App ${cell.app} does not expose an MCP server`); + } + const variant = cell.mcpVariant ?? 
"flat"; + const scenario = app.scenarios.find((s) => s.name === cell.scenario); + if (!scenario) throw new Error(`Scenario "${cell.scenario}" not found on app ${cell.app}`); + + const t0 = performance.now(); + + let transportBytesSent = 0; + let transportBytesReceived = 0; + let inputTokens = 0; + let outputTokens = 0; + let maxContextTokens = 0; + let turns = 0; + let totalToolCalls = 0; + let affordanceToolCalls = 0; + let unknownToolCalls = 0; + let invokeErrorCalls = 0; + let paramErrorCalls = 0; + let llmTimeMs = 0; + let setupTimeMs = 0; + let timeToFirstToolCallMs: number | null = null; + let finishReason: CellMetrics["finishReason"] = "done"; + const turnBreakdown: TurnMetric[] = []; + + const tSetup = performance.now(); + const handle = await app.startMcpServer(cell.scale, variant); + let verification: Awaited>> | undefined; + try { + const listed = await handle.client.listTools(); + setupTimeMs = performance.now() - tSetup; + + const mcpToolNames = new Set(listed.tools.map((t) => t.name)); + const tools: ToolDef[] = listed.tools.map((t) => ({ + name: t.name, + description: t.description ?? "", + parameters: (t.inputSchema as Record) ?? 
{ type: "object", properties: {} }, + })); + + const buildMcpPrompt = resolveMcpVariant(variant); + const systemPrompt = buildMcpPrompt(app.mcpSystemPrompt); + + const history: ChatMessage[] = [{ role: "user", content: scenario.agentPrompt }]; + const tAgentStart = performance.now(); + + while (turns < sweep.maxTurns) { + turns += 1; + const turnIndex = turns - 1; + const tGen = performance.now(); + const res = await provider.generate({ + systemPrompt, + messages: history, + tools, + temperature: sweep.temperature, + }); + const turnLatency = performance.now() - tGen; + llmTimeMs += turnLatency; + inputTokens += res.usage.inputTokens; + outputTokens += res.usage.outputTokens; + if (res.usage.inputTokens > maxContextTokens) maxContextTokens = res.usage.inputTokens; + + history.push(res.message); + const calls = res.message.toolCalls ?? []; + const turn: TurnMetric = { + index: turnIndex, + inputTokens: res.usage.inputTokens, + outputTokens: res.usage.outputTokens, + latencyMs: turnLatency, + toolCalls: calls.length, + toolCallKinds: [], + }; + + if (calls.length === 0) { + turnBreakdown.push(turn); + finishReason = "done"; + break; + } + + if (timeToFirstToolCallMs === null) timeToFirstToolCallMs = performance.now() - tAgentStart; + + for (const call of calls) { + totalToolCalls += 1; + if (!mcpToolNames.has(call.name)) { + unknownToolCalls += 1; + turn.toolCallKinds.push("unknown"); + history.push({ + role: "tool", + content: JSON.stringify({ error: `unknown tool: ${call.name}` }), + toolCallId: call.id, + name: call.name, + }); + continue; + } + try { + const result = await handle.client.callTool({ name: call.name, arguments: call.arguments }); + const sent = JSON.stringify({ name: call.name, arguments: call.arguments }).length; + const content = (result as { content?: Array<{ type: string; text?: string }>; isError?: boolean }).content ?? []; + const resultText = content + .filter((c) => c.type === "text") + .map((c) => c.text ?? 
"") + .join(""); + const isError = (result as { isError?: boolean }).isError === true; + if (isError) { + invokeErrorCalls += 1; + turn.toolCallKinds.push("invoke_error"); + } else { + affordanceToolCalls += 1; + turn.toolCallKinds.push("affordance"); + } + if (process.env.BENCH_DEBUG) { + console.error( + `[mcp-cell] ${isError ? "ERR " : ""}${call.name}(${JSON.stringify(call.arguments).slice(0, 200)}) → ${resultText.slice(0, 200)}`, + ); + } + transportBytesSent += sent; + transportBytesReceived += resultText.length; + history.push({ + role: "tool", + content: resultText || JSON.stringify({ status: "ok" }), + toolCallId: call.id, + name: call.name, + }); + } catch (err) { + invokeErrorCalls += 1; + turn.toolCallKinds.push("invoke_error"); + history.push({ + role: "tool", + content: JSON.stringify({ error: `invoke failed: ${err instanceof Error ? err.message : String(err)}` }), + toolCallId: call.id, + name: call.name, + }); + } + } + + turnBreakdown.push(turn); + } + + if (turns >= sweep.maxTurns && (history[history.length - 1]?.toolCalls?.length ?? 0) > 0) { + finishReason = "max_turns"; + } + + // Verification must run while the MCP server is still alive (it rebuilds state via tool calls). + verification = await handle.verify(scenario); + } finally { + await handle.stop(); + } + + const totalTimeMs = performance.now() - t0; + const attemptedCalls = affordanceToolCalls + unknownToolCalls + paramErrorCalls + invokeErrorCalls; + const specComplianceRate = attemptedCalls > 0 ? affordanceToolCalls / attemptedCalls : 1; + + return { + turns, + toolCalls: totalToolCalls, + navigationToolCalls: 0, + affordanceToolCalls, + unknownToolCalls, + invokeErrorCalls, + paramErrorCalls, + inputTokens, + outputTokens, + totalTokens: inputTokens + outputTokens, + maxContextTokens, + timeToFirstToolCallMs, + setupTimeMs, + llmTimeMs, + totalTimeMs, + transportBytesSent, + transportBytesReceived, + specComplianceRate, + finishReason, + turnBreakdown, + verification: verification + ? 
/**
 * Run one SLOP benchmark cell end to end and return its metrics.
 *
 * Starts the app's SLOP server on `port`, connects a consumer, subscribes to
 * the root of the tree, and drives the LLM in a tool-calling loop for at most
 * `sweep.maxTurns` turns. Each assistant tool call is classified as
 * navigation (`slop_query` / `slop_get_state`), affordance, unknown tool,
 * parameter error, or invoke error. After the server is stopped the scenario
 * is verified against the store.
 *
 * `finishReason` is `"done"` when the model answered without tool calls and
 * `"max_turns"` when the turn budget ran out while the model was still
 * calling tools.
 *
 * @throws Error when `cell.scenario` does not exist on the resolved app.
 */
export async function runSlopCell({ cell, sweep, provider, port }: RunSlopCellArgs): Promise<CellMetrics> {
  const app = resolveApp(cell.app);
  const optimization = resolveOptimization(cell.optimization);
  const encode = resolveEncoding(cell.encoding);
  const buildPrompt = resolvePrompt(cell.prompt);

  const scenario = app.scenarios.find((s) => s.name === cell.scenario);
  if (!scenario) throw new Error(`Scenario "${cell.scenario}" not found on app ${cell.app}`);

  const t0 = performance.now();
  const store = app.createStore(cell.scale, cell.seed);
  const server = await app.startSlopServer(store, port, optimization.serverOpts);

  let transportBytesSent = 0;
  let transportBytesReceived = 0;
  let inputTokens = 0;
  let outputTokens = 0;
  let maxContextTokens = 0;
  let turns = 0;
  let totalToolCalls = 0;
  let navigationToolCalls = 0;
  let affordanceToolCalls = 0;
  let unknownToolCalls = 0;
  let invokeErrorCalls = 0;
  let paramErrorCalls = 0;
  let llmTimeMs = 0;
  let setupTimeMs = 0;
  let timeToFirstToolCallMs: number | null = null;
  let finishReason: CellMetrics["finishReason"] = "done";
  const turnBreakdown: TurnMetric[] = [];

  // Whether the most recent assistant message contained tool calls. Tracked
  // explicitly because tool results are appended to `history` after the
  // assistant message, so the last history entry is never the assistant turn
  // once a tool has been called.
  let lastTurnHadToolCalls = false;
  // Set only after connect() succeeds so the finally block never disconnects
  // a consumer that was never connected.
  let connectedConsumer: SlopConsumer | undefined;

  try {
    const tSetup = performance.now();
    const transport = new WebSocketClientTransport(server.wsUrl);
    const consumer = new SlopConsumer(transport);
    await consumer.connect();
    connectedConsumer = consumer;
    const { id: subId, snapshot } = await consumer.subscribe("/", -1);
    setupTimeMs = performance.now() - tSetup;

    let toolSet = affordancesToTools(snapshot);
    const initialStateText = encode(snapshot);
    const systemPrompt = buildPrompt(initialStateText);

    const navigationTools: ToolDef[] = [
      {
        name: "slop_query",
        description:
          "Load the full subtree at a given path. Use this to expand windowed collections, load lazy children, or resolve stub nodes. Returns the subtree with all properties, children, and affordances.",
        parameters: {
          type: "object",
          properties: {
            path: { type: "string", description: "Tree path to load" },
            depth: { type: "integer", description: "Resolution depth; -1 for full. Default: -1" },
          },
          required: ["path"],
        },
      },
      {
        name: "slop_get_state",
        description: "Return the current full state tree.",
        parameters: { type: "object", properties: {} },
      },
    ];

    // Rebuilt every turn because `toolSet` is replaced whenever the tree changes.
    const buildTools = (): ToolDef[] => [
      ...toolSet.tools.map((t) => ({
        name: t.function.name,
        description: t.function.description,
        parameters: t.function.parameters as Record<string, unknown>,
      })),
      ...navigationTools,
    ];

    const history: ChatMessage[] = [{ role: "user", content: scenario.agentPrompt }];
    const tAgentStart = performance.now();

    while (turns < sweep.maxTurns) {
      turns += 1;
      const turnIndex = turns - 1;
      const tGen = performance.now();
      const res = await provider.generate({
        systemPrompt,
        messages: history,
        tools: buildTools(),
        temperature: sweep.temperature,
      });
      const turnLatency = performance.now() - tGen;
      llmTimeMs += turnLatency;
      inputTokens += res.usage.inputTokens;
      outputTokens += res.usage.outputTokens;
      if (res.usage.inputTokens > maxContextTokens) maxContextTokens = res.usage.inputTokens;

      history.push(res.message);
      const calls = res.message.toolCalls ?? [];
      lastTurnHadToolCalls = calls.length > 0;
      const turn: TurnMetric = {
        index: turnIndex,
        inputTokens: res.usage.inputTokens,
        outputTokens: res.usage.outputTokens,
        latencyMs: turnLatency,
        toolCalls: calls.length,
        toolCallKinds: [],
      };

      // No tool calls means the model considers the task finished.
      if (calls.length === 0) {
        turnBreakdown.push(turn);
        finishReason = "done";
        break;
      }

      if (timeToFirstToolCallMs === null) timeToFirstToolCallMs = performance.now() - tAgentStart;

      let treeChanged = false;

      for (const call of calls) {
        totalToolCalls += 1;

        if (call.name === "slop_query") {
          navigationToolCalls += 1;
          turn.toolCallKinds.push("slop_query");
          const path = String(call.arguments.path ?? "/");
          const depth = Number.isFinite(call.arguments.depth) ? Number(call.arguments.depth) : -1;
          const subtree = await consumer.query(path, depth);
          transportBytesSent += JSON.stringify({ type: "query", path, depth }).length;
          transportBytesReceived += JSON.stringify(subtree).length;
          const subtreeText = encode(subtree as SlopNode);
          history.push({
            role: "tool",
            content: JSON.stringify({ path, tree: subtreeText }),
            toolCallId: call.id,
            name: call.name,
          });
          mergeDiscoveredAffordances(toolSet, subtree as SlopNode, path);
          treeChanged = true;
          continue;
        }

        if (call.name === "slop_get_state") {
          navigationToolCalls += 1;
          turn.toolCallKinds.push("slop_get_state");
          const currentTree = consumer.getTree(subId);
          const text = currentTree ? encode(currentTree) : "No state available";
          transportBytesReceived += text.length;
          history.push({
            role: "tool",
            content: JSON.stringify({ tree: text }),
            toolCallId: call.id,
            name: call.name,
          });
          continue;
        }

        const resolved = toolSet.resolve(call.name);
        if (!resolved) {
          unknownToolCalls += 1;
          turn.toolCallKinds.push("unknown");
          history.push({
            role: "tool",
            content: JSON.stringify({ error: `unknown tool: ${call.name}` }),
            toolCallId: call.id,
            name: call.name,
          });
          continue;
        }

        const invokePath = resolvePath(resolved, call.arguments);
        if (!invokePath) {
          paramErrorCalls += 1;
          turn.toolCallKinds.push("param_error");
          history.push({
            role: "tool",
            content: JSON.stringify({ error: "missing target for grouped affordance" }),
            toolCallId: call.id,
            name: call.name,
          });
          continue;
        }

        try {
          const result = await consumer.invoke(invokePath, resolved.action, call.arguments);
          affordanceToolCalls += 1;
          turn.toolCallKinds.push("affordance");
          transportBytesSent += JSON.stringify({ path: invokePath, action: resolved.action, params: call.arguments }).length;
          transportBytesReceived += JSON.stringify(result).length;
          history.push({
            role: "tool",
            content: JSON.stringify(result.data ?? { status: result.status }),
            toolCallId: call.id,
            name: call.name,
          });
          treeChanged = true;
        } catch (err) {
          // A failed invoke is reported back to the model instead of aborting the cell.
          invokeErrorCalls += 1;
          turn.toolCallKinds.push("invoke_error");
          history.push({
            role: "tool",
            content: JSON.stringify({ error: `invoke failed: ${err instanceof Error ? err.message : String(err)}` }),
            toolCallId: call.id,
            name: call.name,
          });
        }
      }

      turnBreakdown.push(turn);

      if (treeChanged) {
        const updated = consumer.getTree(subId);
        if (updated) toolSet = affordancesToTools(updated);
      }
    }

    // The loop ran out of turns while the model was still issuing tool calls.
    if (turns >= sweep.maxTurns && lastTurnHadToolCalls) {
      finishReason = "max_turns";
    }
  } finally {
    // Disconnect before stopping the server (same order as the success path
    // always had), and stop the server even if disconnect throws.
    try {
      connectedConsumer?.disconnect();
    } finally {
      await server.stop();
    }
  }

  const verification = app.verify(store, scenario);
  const totalTimeMs = performance.now() - t0;
  const attemptedCalls = affordanceToolCalls + unknownToolCalls + paramErrorCalls + invokeErrorCalls;
  const specComplianceRate = attemptedCalls > 0 ? affordanceToolCalls / attemptedCalls : 1;

  return {
    turns,
    toolCalls: totalToolCalls,
    navigationToolCalls,
    affordanceToolCalls,
    unknownToolCalls,
    invokeErrorCalls,
    paramErrorCalls,
    inputTokens,
    outputTokens,
    totalTokens: inputTokens + outputTokens,
    maxContextTokens,
    timeToFirstToolCallMs,
    setupTimeMs,
    llmTimeMs,
    totalTimeMs,
    transportBytesSent,
    transportBytesReceived,
    specComplianceRate,
    finishReason,
    turnBreakdown,
    verification: verification
      ? {
          passed: verification.passed,
          totalChecks: verification.checks.length,
          passedChecks: verification.checks.filter((c) => c.passed).length,
          failures: verification.checks.filter((c) => !c.passed).map((c) => `${c.name}${c.detail ? `: ${c.detail}` : ""}`),
        }
      : undefined,
  };
}
/**
 * Expand a sweep config into cells and run each one sequentially, appending
 * one record per cell to `<resultsRoot>/<sweep.id>/runs.jsonl`.
 *
 * Resume: unless `opts.fresh` is set, an existing runs.jsonl is scanned and
 * every cell whose runId already has a record with metrics and no `error` is
 * skipped. Errored cells are retried.
 *
 * Dry run: `opts.dryRun` only prints the config hash and label of every cell.
 * It returns before the results file is opened, so it never creates,
 * truncates, or appends to runs.jsonl.
 *
 * @returns the expanded cells plus counters; `recorded` counts skipped
 *   (resumed) cells as well as cells run in this invocation.
 */
export async function runSweep(sweep: SweepConfig, opts: SweepRunOptions = {}) {
  const resultsRoot = opts.resultsRoot ?? join(import.meta.dir, "..", "results");
  const outDir = join(resultsRoot, sweep.id);
  const jsonlPath = join(outDir, "runs.jsonl");

  // Resume: scan existing runs.jsonl (if any) and collect completed run IDs.
  // A cell is considered "done" when a record with its runId and no `error`
  // field is present. Errored cells are retried on resume.
  const completedRunIds = new Set<string>();
  let appending = false;
  if (!opts.fresh && existsSync(jsonlPath)) {
    try {
      const raw = readFileSync(jsonlPath, "utf8");
      for (const line of raw.split("\n")) {
        if (!line.trim()) continue;
        try {
          const rec = JSON.parse(line) as Partial<RunRecord> & { runId?: string; error?: string };
          if (rec.runId && !rec.error && rec.metrics) completedRunIds.add(rec.runId);
        } catch {
          // Ignore malformed lines
        }
      }
      if (completedRunIds.size > 0) {
        appending = true;
        console.log(`[sweep] resume: ${completedRunIds.size} completed runs already recorded, appending`);
      }
    } catch (err) {
      console.warn(`[sweep] resume: failed to read ${jsonlPath}: ${err}`);
    }
  }

  const cells = expand(sweep);
  console.log(`[sweep] ${sweep.id}: ${cells.length} cells`);

  // Handle the dry run before the writer is opened: opening in non-append
  // mode would replace any existing results file.
  if (opts.dryRun) {
    for (const cell of cells) {
      const h = configHash(sweep, cell);
      console.log(`[dry] ${h} ${cellLabel(cell)}`);
    }
    return { cells, recorded: 0 };
  }

  const writer = new JsonlWriter(jsonlPath, { append: appending });
  writer.open();
  if (!appending) {
    writer.write({ type: "sweep", config: sweep, startedAt: new Date().toISOString() });
  }

  const providerCache = new Map<string, LlmProvider>();
  let done = 0;
  let skipped = 0;
  let portCursor = BASE_PORT;

  for (const cell of cells) {
    const h = configHash(sweep, cell);
    const id = runId(sweep.id, cell, h);
    if (completedRunIds.has(id)) {
      skipped += 1;
      done += 1;
      console.log(`[${done}/${cells.length}] SKIP ${h} ${cellLabel(cell)}`);
      continue;
    }
    const startedAt = new Date().toISOString();
    const t0 = performance.now();

    const record: RunRecord = {
      sweepId: sweep.id,
      cellId: h,
      runId: id,
      configHash: h,
      cell,
      startedAt,
      durationMs: 0,
    };

    try {
      const provider = getOrCreateProvider(providerCache, cell.provider);
      // Every attempted cell takes the next port, whether or not its protocol uses it.
      const port = portCursor++;

      if (cell.protocol === "slop") {
        const metrics = await runSlopCell({ cell, sweep, provider, port });
        record.metrics = metrics;
      } else if (cell.protocol === "mcp") {
        const metrics = await runMcpCell({ cell, sweep, provider });
        record.metrics = metrics;
      } else {
        throw new Error(`Unknown protocol: ${cell.protocol}`);
      }
    } catch (err) {
      // A failing cell is recorded with its error and the sweep continues.
      record.error = err instanceof Error ? `${err.message}\n${err.stack ?? ""}` : String(err);
    } finally {
      record.durationMs = performance.now() - t0;
    }

    writer.write(record);
    opts.onRecord?.(record);
    done += 1;

    const status = record.error
      ? "ERR"
      : record.metrics?.verification
        ? record.metrics.verification.passed
          ? "PASS"
          : "FAIL"
        : "—";
    console.log(
      `[${done}/${cells.length}] ${status} ${h} t=${record.durationMs.toFixed(0)}ms ${cellLabel(cell)}`,
    );
  }

  await writer.close();
  if (skipped > 0) console.log(`[sweep] done: ${done - skipped} ran, ${skipped} resumed`);
  return { cells, recorded: done, skipped };
}
/**
 * Return the provider for `cfg`, constructing it on first use.
 *
 * Providers are memoised in `cache` under a key built from kind, baseUrl,
 * model and id, so cells that share a provider config share one instance.
 *
 * @throws Error when the kind is not `openai-compat`, or when an
 *   `openai-compat` config has no `baseUrl`.
 */
function getOrCreateProvider(cache: Map<string, LlmProvider>, cfg: ProviderConfig): LlmProvider {
  const cacheKey = [cfg.kind, cfg.baseUrl ?? "", cfg.model, cfg.id ?? ""].join("|");
  const existing = cache.get(cacheKey);
  if (existing) return existing;

  // Only the OpenAI-compatible adapter is wired up so far.
  if (cfg.kind !== "openai-compat") {
    throw new Error(`Provider kind not yet implemented: ${cfg.kind}`);
  }
  if (!cfg.baseUrl) throw new Error("openai-compat provider requires baseUrl");

  const { baseUrl, model, apiKey, id } = cfg;
  const created: LlmProvider = new OpenAICompatProvider({ baseUrl, model, apiKey, id });
  cache.set(cacheKey, created);
  return created;
}
/** Aggregate metrics for one executed cell, as returned by the SLOP and MCP cell runners. */
export interface CellMetrics {
  turns: number;
  toolCalls: number;
  navigationToolCalls: number;
  affordanceToolCalls: number;
  unknownToolCalls: number;
  /** Calls that hit the right affordance but threw during invoke. */
  invokeErrorCalls: number;
  /** Calls that resolved to a valid affordance but had malformed params. */
  paramErrorCalls: number;
  inputTokens: number;
  outputTokens: number;
  totalTokens: number;
  /** Max prompt_tokens observed on any single turn — proxy for peak context pressure. */
  maxContextTokens: number;
  /** Wall-clock ms from user prompt send to first assistant tool call. null = never called a tool. */
  timeToFirstToolCallMs: number | null;
  setupTimeMs: number;
  llmTimeMs: number;
  totalTimeMs: number;
  transportBytesSent: number;
  transportBytesReceived: number;
  /**
   * affordanceToolCalls / (affordanceToolCalls + unknownToolCalls + paramErrorCalls + invokeErrorCalls).
   * Navigation calls are not part of the denominator. 1.0 = every such call was a
   * valid affordance that invoked successfully; also 1.0 when no such call was made.
   */
  specComplianceRate: number;
  finishReason: "done" | "max_turns" | "error";
  turnBreakdown: TurnMetric[];
  verification?: {
    passed: boolean;
    totalChecks: number;
    passedChecks: number;
    failures: string[];
  };
}
"gemma4:31b"; + +const tools: ToolDef[] = [ + { + name: "get_weather", + description: "Get current weather for a city", + parameters: { + type: "object", + properties: { city: { type: "string", description: "City name" } }, + required: ["city"], + }, + }, + { + name: "answer", + description: "Deliver the final answer to the user once you have enough information", + parameters: { + type: "object", + properties: { text: { type: "string" } }, + required: ["text"], + }, + }, +]; + +async function main() { + const provider = new OpenAICompatProvider({ baseUrl: DGX_URL, model: MODEL }); + console.log(`[smoke] provider=${provider.id} url=${DGX_URL}`); + + const history: ChatMessage[] = [ + { role: "user", content: "What's the weather in Tokyo? Report in one short sentence." }, + ]; + + const systemPrompt = + "You are an assistant that always uses tools when they can help. " + + "When you have a final answer, call the `answer` tool."; + + let totalInput = 0; + let totalOutput = 0; + let turn = 0; + const MAX_TURNS = 6; + const t0 = performance.now(); + + while (turn < MAX_TURNS) { + turn += 1; + const res = await provider.generate({ systemPrompt, messages: history, tools }); + totalInput += res.usage.inputTokens; + totalOutput += res.usage.outputTokens; + + console.log( + `[turn ${turn}] finish=${res.finishReason} in=${res.usage.inputTokens} out=${res.usage.outputTokens} latency=${res.rawLatencyMs.toFixed(0)}ms`, + ); + + history.push(res.message); + + if (!res.message.toolCalls || res.message.toolCalls.length === 0) { + console.log(`[turn ${turn}] assistant: ${res.message.content.slice(0, 200)}`); + break; + } + + for (const call of res.message.toolCalls) { + console.log(`[turn ${turn}] tool_call ${call.name}(${JSON.stringify(call.arguments)})`); + if (call.name === "answer") { + console.log(`\nFINAL ANSWER: ${String(call.arguments.text ?? 
"")}`); + printSummary(totalInput, totalOutput, t0, turn, true); + return; + } + const result = dispatchTool(call.name, call.arguments); + history.push({ + role: "tool", + content: JSON.stringify(result), + toolCallId: call.id, + name: call.name, + }); + } + } + + printSummary(totalInput, totalOutput, t0, turn, false); +} + +function dispatchTool(name: string, args: Record): unknown { + if (name === "get_weather") { + return { city: args.city ?? "unknown", temp_c: 18, conditions: "partly cloudy" }; + } + return { error: `unknown tool: ${name}` }; +} + +function printSummary(inTok: number, outTok: number, t0: number, turns: number, answered: boolean) { + const total = performance.now() - t0; + console.log("\n--- smoke summary ---"); + console.log(`turns: ${turns}`); + console.log(`input tok: ${inTok}`); + console.log(`output tok: ${outTok}`); + console.log(`total ms: ${total.toFixed(0)}`); + console.log(`answered: ${answered}`); +} + +main().catch((err) => { + console.error("[smoke] failed:", err); + process.exit(1); +}); diff --git a/benchmarks/v2/tsconfig.json b/benchmarks/v2/tsconfig.json new file mode 100644 index 0000000..1e4242a --- /dev/null +++ b/benchmarks/v2/tsconfig.json @@ -0,0 +1,16 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "ESNext", + "moduleResolution": "bundler", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "allowImportingTsExtensions": true, + "noEmit": true, + "resolveJsonModule": true, + "types": ["bun-types"], + "lib": ["ES2022"] + }, + "include": ["**/*.ts"] +} diff --git a/benchmarks/v2/variants/encodings.ts b/benchmarks/v2/variants/encodings.ts new file mode 100644 index 0000000..587666f --- /dev/null +++ b/benchmarks/v2/variants/encodings.ts @@ -0,0 +1,157 @@ +import { formatTree } from "@slop-ai/consumer"; +import type { SlopNode } from "@slop-ai/consumer"; + +/** + * Tree encoders translate a SlopNode into the text string embedded in the + * system prompt. 
/**
 * Look up a tree encoder by its variant id.
 *
 * Only keys defined directly on `ENCODING_VARIANTS` are accepted. A plain
 * index lookup would also resolve inherited `Object.prototype` members such
 * as "constructor" or "toString" and return them as if they were encoders.
 *
 * @throws Error listing the available ids when `id` is not a registered variant.
 */
export function resolveEncoding(id: string): TreeEncoder {
  const fn = Object.prototype.hasOwnProperty.call(ENCODING_VARIANTS, id) ? ENCODING_VARIANTS[id] : undefined;
  if (!fn) throw new Error(`Unknown encoding variant: ${id}. Available: ${Object.keys(ENCODING_VARIANTS).join(", ")}`);
  return fn;
}
/**
 * Append a YAML rendering of `value` to `lines`, two spaces per indent level.
 *
 * Scalars, `null`/`undefined`, and empty collections are written inline by
 * appending to the LAST entry of `lines`, so the caller must already have
 * pushed the `key:` (or `-`) line they belong to. Non-empty objects and
 * arrays push new lines at `indent`.
 *
 * For a sequence item that is an object, the first key shares the `- ` line
 * and the remaining keys are aligned under it. Values of every key in the
 * item, including the first, are emitted at `indent + 2` so that a nested
 * mapping is indented deeper than the item's own keys.
 */
function emitYaml(value: unknown, indent: number, lines: string[]): void {
  const pad = "  ".repeat(indent);
  if (value === null || value === undefined) {
    lines[lines.length - 1] = `${lines[lines.length - 1]} null`;
    return;
  }
  if (typeof value !== "object") {
    lines[lines.length - 1] = `${lines[lines.length - 1]} ${yamlScalar(value)}`;
    return;
  }
  if (Array.isArray(value)) {
    if (value.length === 0) {
      lines[lines.length - 1] = `${lines[lines.length - 1]} []`;
      return;
    }
    for (const item of value) {
      if (item !== null && typeof item === "object" && !Array.isArray(item)) {
        const record = item as Record<string, unknown>;
        const keys = Object.keys(record);
        if (keys.length === 0) {
          lines.push(`${pad}- {}`);
          continue;
        }
        keys.forEach((key, i) => {
          // The first key sits after "- "; later keys use two spaces so all
          // keys of the item start in the same column.
          lines.push(i === 0 ? `${pad}- ${key}:` : `${pad}  ${key}:`);
          emitYaml(record[key], indent + 2, lines);
        });
      } else {
        lines.push(`${pad}-`);
        emitYaml(item, indent + 1, lines);
      }
    }
    return;
  }
  const record = value as Record<string, unknown>;
  const keys = Object.keys(record);
  if (keys.length === 0) {
    lines[lines.length - 1] = `${lines[lines.length - 1]} {}`;
    return;
  }
  for (const key of keys) {
    lines.push(`${pad}${key}:`);
    emitYaml(record[key], indent + 1, lines);
  }
}

/** Strings made only of these characters need no quoting for syntactic reasons. */
const YAML_PLAIN_SAFE = /^[\w.\-/]+$/;
/** null / true / false in any casing. */
const YAML_RESERVED_WORD = /^(?:null|true|false)$/i;
/** Decimal ints and floats, hex and octal ints, and .inf / .nan. */
const YAML_NUMBER_LIKE =
  /^(?:[-+]?(?:\.\d+|\d+(?:\.\d*)?)(?:[eE][-+]?\d+)?|0x[0-9a-fA-F]+|0o[0-7]+|[-+]?\.(?:inf|Inf|INF)|\.(?:nan|NaN|NAN))$/;

/**
 * Render a non-object value as a YAML scalar.
 *
 * A string is emitted bare only when it is made of plain-safe characters AND
 * would not be read back as a number, boolean, or null; otherwise it is
 * emitted as a JSON double-quoted string. Non-strings use `String(v)`.
 */
function yamlScalar(v: unknown): string {
  if (typeof v !== "string") return String(v);
  const readsAsOtherType = YAML_RESERVED_WORD.test(v) || YAML_NUMBER_LIKE.test(v);
  if (YAML_PLAIN_SAFE.test(v) && !readsAsOtherType) return v;
  return JSON.stringify(v);
}
/**
 * Render a property value for the markdown encoding: strings are emitted
 * verbatim (no quotes), every other value is JSON-encoded.
 */
function formatProp(v: unknown): string {
  return typeof v === "string" ? v : JSON.stringify(v);
}
/**
 * Look up an MCP prompt builder by its variant id.
 *
 * Only keys defined directly on `MCP_VARIANTS` are accepted. A plain index
 * lookup would also resolve inherited `Object.prototype` members such as
 * "constructor" or "toString" and return them as if they were builders.
 *
 * @throws Error listing the available ids when `id` is not a registered variant.
 */
export function resolveMcpVariant(id: string): McpPromptBuilder {
  const fn = Object.prototype.hasOwnProperty.call(MCP_VARIANTS, id) ? MCP_VARIANTS[id] : undefined;
  if (!fn) throw new Error(`Unknown mcp variant: ${id}. Available: ${Object.keys(MCP_VARIANTS).join(", ")}`);
  return fn;
}
/**
 * Look up an optimization variant by id.
 *
 * Only keys defined directly on `OPTIMIZATION_VARIANTS` are accepted. A plain
 * index lookup would also resolve inherited `Object.prototype` members such
 * as "constructor", which would then be used as a variant with no
 * `serverOpts`.
 *
 * @throws Error listing the available ids when `id` is not a registered variant.
 */
export function resolveOptimization(id: string): OptimizationVariant {
  const v = Object.prototype.hasOwnProperty.call(OPTIMIZATION_VARIANTS, id) ? OPTIMIZATION_VARIANTS[id] : undefined;
  if (!v) throw new Error(`Unknown optimization variant: ${id}. Available: ${Object.keys(OPTIMIZATION_VARIANTS).join(", ")}`);
  return v;
}
When done, respond with "DONE".`; + +const spec: PromptBuilder = (stateContext) => + `${SLOP_SYSTEM_PROMPT}${stateContext}\n\nComplete the task using the available tools. When done, respond with "DONE".`; + +// Half-length spec prompt — compressed to the essentials. Tests how much +// of the full framing is actually doing work vs. restating what's obvious. +const SPEC_TERSE_HEADER = `You are an agent interacting with an application via the SLOP protocol. + +The application exposes its state as a tree of nodes. Each node has: +- properties (data) +- affordances (actions currently available on this node — do not attempt actions that aren't listed) +- meta (optional hints like salience, summary, total_children) + +Tools: +- Node actions are named \`nodeId__action\` and perform the affordance. +- \`slop_query(path)\` — load a subtree (use for lazy nodes, stubs, or windowed collections). +- \`slop_get_state\` — read the full tree. + +Affordances are contextual — they may change after you act. A hidden action is an action you cannot perform right now. + +## Current state + +`; + +const specTerse: PromptBuilder = (stateContext) => + `${SPEC_TERSE_HEADER}${stateContext}\n\nComplete the task using the available tools. When done, respond with "DONE".`; + +// Role-play framing — same information, but packaged as a persona. Tests +// whether the model responds to instruction-following framing over raw +// specification language. +const ROLE_PLAY_HEADER = `You are a careful operations engineer working inside an application. The application shows you its current state as a tree, and the tree tells you which actions are available on which parts of the state. + +Your rules: +1. Never attempt an action that isn't explicitly listed as an affordance on the node you want to act on. +2. If you can't see the thing you need, call \`slop_query\` on the path you expect, or \`slop_get_state\` to re-read the tree. +3. 
After you act, check whether the tree changed and whether the affordances you need still exist. + +## Current state + +`; + +const rolePlay: PromptBuilder = (stateContext) => + `${ROLE_PLAY_HEADER}${stateContext}\n\nComplete the task using the available tools. When done, respond with "DONE".`; + +export const PROMPT_VARIANTS: Record = { + empty, + minimal, + basic, + spec, + "spec-terse": specTerse, + "role-play": rolePlay, +}; + +export function resolvePrompt(id: string): PromptBuilder { + const fn = PROMPT_VARIANTS[id]; + if (!fn) throw new Error(`Unknown prompt variant: ${id}. Available: ${Object.keys(PROMPT_VARIANTS).join(", ")}`); + return fn; +} diff --git a/bun.lock b/bun.lock index 029f302..5b4d517 100644 --- a/bun.lock +++ b/bun.lock @@ -46,6 +46,18 @@ "bun-types": "^1.3.11", }, }, + "benchmarks/v2": { + "name": "slop-benchmarks-v2", + "dependencies": { + "@modelcontextprotocol/sdk": "^1.29.0", + "@slop-ai/consumer": "workspace:*", + "@slop-ai/core": "workspace:*", + "@slop-ai/server": "workspace:*", + }, + "devDependencies": { + "bun-types": "^1.3.11", + }, + }, "examples/cli/bun": { "name": "tsk", "version": "0.1.0", @@ -2545,6 +2557,8 @@ "slice-ansi": ["slice-ansi@5.0.0", "", { "dependencies": { "ansi-styles": "^6.0.0", "is-fullwidth-code-point": "^4.0.0" } }, "sha512-FC+lgizVPfie0kkhqUScwRu1O/lF6NOgJmlCgK+/LYxDCTk8sGelYaHDhFcDN+Sn3Cv+3VSa4Byeo+IMCzpMgQ=="], + "slop-benchmarks-v2": ["slop-benchmarks-v2@workspace:benchmarks/v2"], + "slop-bridge-mcp-proxy": ["slop-bridge-mcp-proxy@workspace:packages/typescript/integrations/claude/slop-mcp-proxy/servers"], "slop-bridge-native": ["slop-bridge-native@workspace:packages/typescript/integrations/claude/slop-native/servers"], diff --git a/package.json b/package.json index 6f5aeff..849101c 100644 --- a/package.json +++ b/package.json @@ -19,7 +19,8 @@ "website/playground", "apps/extension", "apps/desktop", - "benchmarks/mcp-vs-slop" + "benchmarks/mcp-vs-slop", + "benchmarks/v2" ], "scripts": { "build": "bun run 
scripts/build-typescript-packages.ts",