diff --git a/benchmarks/v2/README.md b/benchmarks/v2/README.md
new file mode 100644
index 0000000..d4fe54a
--- /dev/null
+++ b/benchmarks/v2/README.md
@@ -0,0 +1,61 @@
+# SLOP Benchmarks v2 — Experiment rig (WIP)
+
+Successor to [`benchmarks/mcp-vs-slop`](../mcp-vs-slop/). v1 stays in place as a regression anchor; v2 turns it into a proper experiment framework so we can drive SLOP v0.2 spec decisions from data.
+
+Design spec: [fluffy-napping-walrus.md](../../.claude/plans/fluffy-napping-walrus.md) (local plan file).
+
+## Status
+
+- [x] Phase A — DGX inference path (OpenAI-compat provider + smoke test)
+- [ ] Phase B — Sweep runner + config matrix
+- [ ] Phase C — Prompt / encoding / optimization variants
+- [ ] Phase C' — Fair-MCP variants
+- [ ] Phase D — Metrics + statistical post-processing
+- [ ] Phase E — Static dashboard
+- [ ] Phase F — App complexity ladder (todo, file-browser, crm)
+
+## DGX Spark setup
+
+Models are served via Ollama on `slopinator-s-1.local`. The systemd unit has an override that binds Ollama to all interfaces on both address families:
+
+```ini
+# /etc/systemd/system/ollama.service.d/override.conf
+[Service]
+Environment=OLLAMA_HOST=[::]:11434
+```
+
+Binding to `[::]` makes Ollama listen on both IPv4 and IPv6. This is required because Bun's `fetch` resolves `.local` names to IPv6 first and does not fall back to IPv4. If the override is ever lost, Bun reports `ConnectionRefused` while `curl` still works; that mismatch is the tell.
+
+## Smoke test
+
+```bash
+cd benchmarks/v2
+bun run smoke/provider-test.ts
+SLOP_SMOKE_MODEL=nemotron-3-super:120b bun run smoke/provider-test.ts
+```
+
+Runs a multi-turn tool-calling conversation (weather lookup → answer) against the configured model. Prints per-turn token counts, latency, and whether the model successfully delivered the final answer tool-call. Fails loudly if the OpenAI-compat endpoint misbehaves.
+
+## Environment variables
+
+| Var | Default | Notes |
+|---|---|---|
+| `SLOP_DGX_URL` | `http://slopinator-s-1.local:11434/v1` | Override to point at a different host |
+| `SLOP_SMOKE_MODEL` | `gemma4:31b` | Any model in `ollama list` |
+
+## Layout (target)
+
+```
+v2/
+├── providers/              # LlmProvider interface + adapters
+│   ├── types.ts
+│   └── openai-compat.ts    # Ollama, vLLM, OpenAI, anything /v1-compatible
+├── variants/               # prompts/, encodings/, optimizations/ (Phase C)
+├── mcp-variants/           # fair-MCP pass (Phase C')
+├── apps/                   # todo / file-browser / issue-tracker / crm (Phase F)
+├── scenarios/              # shared scenario types
+├── metrics/                # collectors + stats (Phase D)
+├── runner/                 # sweep orchestrator (Phase B)
+├── dashboard/              # static HTML report (Phase E)
+└── smoke/                  # validation scripts
+```
diff --git a/benchmarks/v2/apps/crm/index.ts b/benchmarks/v2/apps/crm/index.ts
new file mode 100644
index 0000000..97d58ee
--- /dev/null
+++ b/benchmarks/v2/apps/crm/index.ts
@@ -0,0 +1,88 @@
+import { fileURLToPath } from "node:url";
+import { Client } from "@modelcontextprotocol/sdk/client";
+import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";
+import { CrmStore } from "./store.ts";
+import { seedCrm } from "./seed.ts";
+import { startCrmSlopServer, type CrmSlopOpts } from "./slop-server.ts";
+import { crmScenarios } from "./scenarios.ts";
+import type { AppBinding, AppStore, McpServerHandle, SlopServerHandle } from "../registry.ts";
+import type { DataScale } from "../../runner/types.ts";
+import type { Scenario, VerificationResult } from "../../../mcp-vs-slop/scenarios/types.ts";
+
+/**
+ * Boxes a `CrmStore` inside the branded `AppStore` handle, keeping the
+ * concrete store reachable through the extra `inner` property.
+ */
+function wrap(inner: CrmStore): AppStore & { inner: CrmStore } {
+  type Boxed = AppStore & { inner: CrmStore };
+  const boxed = { __brand: "app-store", inner };
+  return boxed as Boxed;
+}
+
+/**
+ * Registry binding for the CRM benchmark app: builds seeded stores, starts
+ * the SLOP and MCP servers, and runs scenario verifiers against final state.
+ */
+export const crmApp: AppBinding = {
+  id: "crm",
+  supportedScales: ["s", "m", "l", "xl"],
+  /** Creates a `CrmStore` populated by `seedCrm(scale, seed)` and boxes it as an `AppStore`. */
+  createStore(scale, seed) {
+    const store = new CrmStore();
+    const { contacts, deals, activities } = seedCrm(scale, seed);
+    store.reset(contacts, deals, activities);
+    return wrap(store);
+  },
+  /** Starts the SLOP server for `store` on `port`; `stop()` stops both objects returned by `startCrmSlopServer`. */
+  async startSlopServer(store, port, opts): Promise<SlopServerHandle> {
+    const inner = (store as unknown as { inner: CrmStore }).inner;
+    const { server, slop } = startCrmSlopServer(inner, port, opts as CrmSlopOpts | undefined);
+    return {
+      wsUrl: `ws://localhost:${port}/slop`,
+      stop: async () => {
+        slop.stop();
+        server.stop();
+      },
+    };
+  },
+  scenarios: crmScenarios,
+  /** Runs `scenario.verify` against the unboxed store; returns `undefined` when the scenario has no verifier. */
+  verify(store, scenario) {
+    if (!scenario.verify) return undefined;
+    const inner = (store as unknown as { inner: CrmStore }).inner;
+    return scenario.verify(inner as unknown as Parameters<NonNullable<Scenario["verify"]>>[0]);
+  },
+  mcpSystemPrompt:
+    "You are a CRM agent. You have tools to list and mutate contacts, deals, and activities. " +
+    "You have no prior knowledge of the data — discover it using list_* and get_* tools. " +
+    'When the task is complete, respond with "DONE".',
+  /**
+   * Spawns `mcp-server.ts` as a child `bun` process speaking MCP over stdio and
+   * connects a client to it. The child seeds its own store from `BENCH_SCALE`
+   * and `BENCH_SEED` (always 42 here).
+   *
+   * The returned `verify` rebuilds a temporary `CrmStore` from the server's
+   * `list_*` tools, because the live state is held by the child process.
+   */
+  async startMcpServer(scale: DataScale, _variant: string): Promise<McpServerHandle> {
+    // All current MCP variants share the flat server; prompt-level variants
+    // are applied by the cell runner via resolveMcpVariant.
+    const env: Record<string, string> = { ...process.env } as Record<string, string>;
+    env.BENCH_SCALE = scale;
+    env.BENCH_SEED = String(42);
+    // fileURLToPath rather than URL#pathname: pathname stays percent-encoded,
+    // so a checkout path containing spaces or non-ASCII characters would point
+    // the child process at a file that does not exist.
+    const serverPath = fileURLToPath(new URL("./mcp-server.ts", import.meta.url));
+    const transport = new StdioClientTransport({
+      command: "bun",
+      args: ["run", serverPath],
+      env,
+    });
+    const client = new Client({ name: "slop-benchmarks-v2", version: "0.2.0" });
+    await client.connect(transport);
+    return {
+      client,
+      stop: async () => {
+        await client.close();
+      },
+      verify: async (scenario: Scenario): Promise<VerificationResult | undefined> => {
+        if (!scenario.verify) return undefined;
+        // Reconstruct by listing all three entity collections.
+        const tempStore = new CrmStore();
+        const [cRes, dRes, aRes] = await Promise.all([
+          client.callTool({ name: "list_contacts", arguments: {} }),
+          client.callTool({ name: "list_deals", arguments: {} }),
+          client.callTool({ name: "list_activities", arguments: {} }),
+        ]);
+        tempStore.reset(parseJson(cRes), parseJson(dRes), parseJson(aRes));
+        return scenario.verify(tempStore as unknown as Parameters<NonNullable<Scenario["verify"]>>[0]);
+      },
+    };
+  },
+};
+
+/**
+ * Extracts the JSON array carried in the first text part of an MCP tool result.
+ *
+ * Returns `[]` when the result has no text part, the text is not valid JSON,
+ * or the decoded payload is not an array (for example the server's
+ * `{ error }` object), so the value can always be handed to `CrmStore.reset`.
+ *
+ * @param result Raw value returned by `client.callTool`.
+ * @returns The decoded array, or an empty array on any of the cases above.
+ */
+function parseJson(result: unknown): any[] {
+  const parts = (result as { content?: Array<{ type: string; text?: string }> } | null | undefined)?.content ?? [];
+  const text = parts.find((c) => c.type === "text")?.text ?? "[]";
+  try {
+    const parsed: unknown = JSON.parse(text);
+    // JSON.parse also yields objects, strings, numbers and null; only an array is usable here.
+    return Array.isArray(parsed) ? parsed : [];
+  } catch {
+    return [];
+  }
+}
diff --git a/benchmarks/v2/apps/crm/mcp-server.ts b/benchmarks/v2/apps/crm/mcp-server.ts
new file mode 100644
index 0000000..9e9b981
--- /dev/null
+++ b/benchmarks/v2/apps/crm/mcp-server.ts
@@ -0,0 +1,120 @@
+/**
+ * Stdio MCP server for the crm benchmark app. Spawned as a child process by
+ * the MCP cell runner. Env vars:
+ * - BENCH_SCALE = s | m | l | xl
+ * - BENCH_SEED  = integer
+ */
+
+import { Server } from "@modelcontextprotocol/sdk/server";
+import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
+import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";
+import { CrmStore, type ActivityType, type DealStage } from "./store.ts";
+import { seedCrm } from "./seed.ts";
+import type { DataScale } from "../../runner/types.ts";
+
+// Benchmark configuration arrives from the parent process through env vars.
+// Reject an unknown scale up front: seedCrm indexes its size table by scale,
+// so a typo would otherwise surface as an opaque TypeError during seeding.
+const rawScale = process.env.BENCH_SCALE ?? "s";
+if (!["s", "m", "l", "xl"].includes(rawScale)) {
+  throw new Error(`invalid BENCH_SCALE "${rawScale}" (expected s|m|l|xl)`);
+}
+const scale = rawScale as DataScale;
+const seed = Number(process.env.BENCH_SEED ?? 42);
+
+// Seed the in-memory store once at startup; tool calls below mutate it in place.
+const store = new CrmStore();
+const { contacts, deals, activities } = seedCrm(scale, seed);
+store.reset(contacts, deals, activities);
+
+const server = new Server({ name: "crm-mcp", version: "0.2.0" }, { capabilities: { tools: {} } });
+
+/**
+ * Advertises the tool catalogue. Argument names are snake_case (`deal_id`,
+ * `contact_id`) and must match what the CallTool handler reads, so the
+ * descriptions refer to the arguments by those exact names.
+ */
+server.setRequestHandler(ListToolsRequestSchema, async () => ({
+  tools: [
+    { name: "list_contacts", description: "List every contact", inputSchema: { type: "object" as const, properties: {} } },
+    { name: "list_deals", description: "List every deal. Optional filter by stage.", inputSchema: { type: "object" as const, properties: { stage: { type: "string", description: "lead|qualified|proposal|won|lost" } } } },
+    { name: "list_activities", description: "List every activity. Optional filter by deal_id or contact_id.", inputSchema: { type: "object" as const, properties: { deal_id: { type: "string" }, contact_id: { type: "string" } } } },
+    { name: "get_contact", description: "Get a contact by id", inputSchema: { type: "object" as const, properties: { id: { type: "string" } }, required: ["id"] } },
+    { name: "get_deal", description: "Get a deal by id", inputSchema: { type: "object" as const, properties: { id: { type: "string" } }, required: ["id"] } },
+    { name: "get_activity", description: "Get an activity by id", inputSchema: { type: "object" as const, properties: { id: { type: "string" } }, required: ["id"] } },
+    { name: "deals_for_contact", description: "Return every deal belonging to a contact", inputSchema: { type: "object" as const, properties: { contact_id: { type: "string" } }, required: ["contact_id"] } },
+    { name: "activities_for_deal", description: "Return every activity attached to a deal", inputSchema: { type: "object" as const, properties: { deal_id: { type: "string" } }, required: ["deal_id"] } },
+    { name: "activities_for_contact", description: "Return every activity attached to a contact", inputSchema: { type: "object" as const, properties: { contact_id: { type: "string" } }, required: ["contact_id"] } },
+    { name: "advance_deal_stage", description: "Set a deal's stage", inputSchema: { type: "object" as const, properties: { id: { type: "string" }, stage: { type: "string", description: "lead|qualified|proposal|won|lost" } }, required: ["id", "stage"] } },
+    { name: "set_deal_value", description: "Set a deal's USD value", inputSchema: { type: "object" as const, properties: { id: { type: "string" }, value: { type: "number" } }, required: ["id", "value"] } },
+    { name: "add_activity", description: "Create a new activity on a deal or contact. Provide deal_id XOR contact_id.", inputSchema: { type: "object" as const, properties: { deal_id: { type: "string" }, contact_id: { type: "string" }, type: { type: "string", description: "call|email|meeting|note" }, subject: { type: "string" }, body: { type: "string" } }, required: ["type", "subject", "body"] } },
+    { name: "delete_contact", description: "Delete a contact", inputSchema: { type: "object" as const, properties: { id: { type: "string" } }, required: ["id"] } },
+    { name: "delete_deal", description: "Delete a deal", inputSchema: { type: "object" as const, properties: { id: { type: "string" } }, required: ["id"] } },
+    { name: "delete_activity", description: "Delete an activity", inputSchema: { type: "object" as const, properties: { id: { type: "string" } }, required: ["id"] } },
+  ],
+}));
+
+/**
+ * Dispatches a tool call against the in-memory store.
+ *
+ * Every branch answers with a single JSON text part. Validation failures,
+ * unknown tool names and exceptions thrown by the store are all reported as
+ * `isError` results instead of escaping the handler.
+ */
+server.setRequestHandler(CallToolRequestSchema, async (req) => {
+  const { name, arguments: args } = req.params;
+  const a = (args ?? {}) as Record<string, unknown>;
+  try {
+    switch (name) {
+      case "list_contacts":
+        return json(store.contacts);
+      case "list_deals": {
+        const stage = a.stage ? String(a.stage) : undefined;
+        const deals = stage ? store.deals.filter((d) => d.stage === stage) : store.deals;
+        return json(deals);
+      }
+      case "list_activities": {
+        // Both filters are optional and are applied cumulatively when both are given.
+        let out = store.activities;
+        if (a.deal_id) out = out.filter((x) => x.dealId === a.deal_id);
+        if (a.contact_id) out = out.filter((x) => x.contactId === a.contact_id);
+        return json(out);
+      }
+      case "get_contact": {
+        const contact = store.getContact(String(a.id));
+        return contact ? json(contact) : err(`contact ${a.id} not found`);
+      }
+      case "get_deal": {
+        const deal = store.getDeal(String(a.id));
+        return deal ? json(deal) : err(`deal ${a.id} not found`);
+      }
+      case "get_activity": {
+        const activity = store.getActivity(String(a.id));
+        return activity ? json(activity) : err(`activity ${a.id} not found`);
+      }
+      case "deals_for_contact": return json(store.dealsForContact(String(a.contact_id)));
+      case "activities_for_deal": return json(store.activitiesForDeal(String(a.deal_id)));
+      case "activities_for_contact": return json(store.activitiesForContact(String(a.contact_id)));
+      case "advance_deal_stage": {
+        const stage = String(a.stage);
+        if (!["lead", "qualified", "proposal", "won", "lost"].includes(stage)) return err(`invalid stage ${stage}`);
+        store.advanceStage(String(a.id), stage as DealStage);
+        return json({ id: a.id, stage });
+      }
+      case "set_deal_value": {
+        const value = Number(a.value);
+        // Number(undefined) and Number("abc") are NaN; storing that would corrupt
+        // the deal and be echoed back as `null` by JSON.stringify.
+        if (!Number.isFinite(value)) return err(`invalid value: ${String(a.value)} (expected a finite number)`);
+        store.setDealValue(String(a.id), value);
+        return json({ id: a.id, value });
+      }
+      case "add_activity": {
+        const missing = ["type", "subject", "body"].filter((k) => a[k] == null);
+        if (missing.length > 0) return err(`missing required fields: ${missing.join(", ")}`);
+        const type = String(a.type);
+        if (!["call", "email", "meeting", "note"].includes(type)) return err(`invalid type: ${type} (expected call|email|meeting|note)`);
+        // Exactly one of deal_id / contact_id must be supplied.
+        const dealId = a.deal_id ? String(a.deal_id) : null;
+        const contactId = a.contact_id ? String(a.contact_id) : null;
+        if (dealId && contactId) return err("provide deal_id OR contact_id, not both");
+        if (!dealId && !contactId) return err("provide deal_id OR contact_id");
+        const activity = store.addActivity({
+          dealId,
+          contactId,
+          type: type as ActivityType,
+          subject: String(a.subject),
+          body: String(a.body),
+        });
+        return json(activity);
+      }
+      case "delete_contact":
+        store.deleteContact(String(a.id));
+        return json({ deleted: a.id });
+      case "delete_deal":
+        store.deleteDeal(String(a.id));
+        return json({ deleted: a.id });
+      case "delete_activity":
+        store.deleteActivity(String(a.id));
+        return json({ deleted: a.id });
+      default:
+        return err(`unknown tool ${name}`);
+    }
+  } catch (e) {
+    return err(e instanceof Error ? e.message : String(e));
+  }
+});
+
+/** Wraps `data` as a tool result holding a single JSON-encoded text part. */
+function json(data: unknown) {
+  const text = JSON.stringify(data);
+  return { content: [{ type: "text", text }] };
+}
+/** Builds an `isError` tool result whose text part is the JSON object `{ error: msg }`. */
+function err(msg: string) {
+  const text = JSON.stringify({ error: msg });
+  return { content: [{ type: "text", text }], isError: true };
+}
+
+// Attach the server to a stdio transport; top-level await keeps the module
+// pending until the connection is established.
+const transport = new StdioServerTransport();
+await server.connect(transport);
diff --git a/benchmarks/v2/apps/crm/scenarios.ts b/benchmarks/v2/apps/crm/scenarios.ts
new file mode 100644
index 0000000..a48e8db
--- /dev/null
+++ b/benchmarks/v2/apps/crm/scenarios.ts
@@ -0,0 +1,152 @@
+import type { Scenario, VerificationResult } from "../../../mcp-vs-slop/scenarios/types.ts";
+import type { CrmStore } from "./store.ts";
+
+// Shared empty step list: every scenario in this file passes it as `steps`
+// and relies on `agentPrompt` plus `verify` instead.
+const empty: Scenario["steps"] = [];
+
+/**
+ * Scenario 1 — qualify-leads: multi-entity reasoning.
+ * "For every deal in stage=lead that has at least one 'call' activity,
+ *  advance it to stage=qualified."
+ *
+ * Tests the agent's ability to correlate across deals and activities before
+ * acting. MCP needs list_deals + list_activities; SLOP can see both in the
+ * tree at once.
+ *
+ * Only the final state is available (no pre-run snapshot), so the verifier
+ * checks three things:
+ *  - no deal is left in stage=lead while it has a 'call' activity;
+ *  - the two deals the seed pins as lead-with-call (`deal-1`, `deal-2`) are
+ *    now in stage=qualified;
+ *  - at least one deal is in stage=qualified.
+ * It cannot detect unrelated deals being modified.
+ */
+function verifyQualifyLeads(store: CrmStore): VerificationResult {
+  const checks: VerificationResult["checks"] = [];
+  for (const deal of store.deals) {
+    const hadCall = store.activitiesForDeal(deal.id).some((a) => a.type === "call");
+    // Any deal still in stage=lead that has a call activity is a miss.
+    if (deal.stage === "lead" && hadCall) {
+      checks.push({
+        name: `${deal.id} lead-with-call advanced to qualified`,
+        passed: false,
+        detail: `deal still in lead with call activity`,
+      });
+    }
+  }
+  // seedCrm forces deal-1 and deal-2 into stage=lead and attaches a call to
+  // each. Checking them by id closes a loophole: moving them to another stage
+  // (or deleting them) clears the loop above, while a deal the seed happened
+  // to place in stage=qualified would satisfy the generic check below.
+  for (const id of ["deal-1", "deal-2"]) {
+    const stage = store.getDeal(id)?.stage;
+    checks.push({
+      name: `${id} (seeded lead with call) advanced to qualified`,
+      passed: stage === "qualified",
+      detail: stage === "qualified" ? undefined : `expected stage qualified, found ${stage ?? "no such deal"}`,
+    });
+  }
+  // Generic positive signal, kept so existing check names stay stable.
+  const anyQualified = store.deals.some((d) => d.stage === "qualified");
+  checks.push({
+    name: "at least one deal advanced to qualified",
+    passed: anyQualified,
+  });
+  return { passed: checks.every((c) => c.passed), checks };
+}
+
+/**
+ * Scenario 2 — high-value-alert: filter + mutate.
+ * "Add a note activity with subject='High value' and body='flagged' to every
+ *  deal where valueUsd > 50000."
+ *
+ * Tests filtering-then-acting. A deal counts as flagged when it has a 'note'
+ * activity whose subject matches /high\s*value/i (case-insensitive, optional
+ * whitespace); the body is not inspected. Every deal above $50,000 must be
+ * flagged, and any flagged deal at or below $50,000 is reported as a failure.
+ */
+function verifyHighValueAlert(store: CrmStore): VerificationResult {
+  const THRESHOLD_USD = 50000;
+  const isFlagged = (dealId: string): boolean =>
+    store.activitiesForDeal(dealId).some((act) => act.type === "note" && /high\s*value/i.test(act.subject));
+
+  const checks: VerificationResult["checks"] = [];
+
+  // Pass 1: one check per deal above the threshold.
+  for (const deal of store.deals) {
+    if (!(deal.valueUsd > THRESHOLD_USD)) continue;
+    const flagged = isFlagged(deal.id);
+    checks.push({
+      name: `${deal.id} flagged as high value`,
+      passed: flagged,
+      detail: flagged ? undefined : `no note with subject "High value" on deal valued $${deal.valueUsd}`,
+    });
+  }
+
+  // Pass 2: deals at or below the threshold only produce a check when wrongly flagged.
+  for (const deal of store.deals) {
+    if (!(deal.valueUsd <= THRESHOLD_USD)) continue;
+    if (!isFlagged(deal.id)) continue;
+    checks.push({
+      name: `${deal.id} should not be flagged high value`,
+      passed: false,
+      detail: `low-value deal ($${deal.valueUsd}) was incorrectly flagged`,
+    });
+  }
+
+  return { passed: checks.every((c) => c.passed), checks };
+}
+
+/**
+ * Scenario 3 — contact-cleanup: orphan detection.
+ * "Delete every contact that has no deals and no activities."
+ *
+ * The seed guarantees two orphan contacts (`orphan-1`, `orphan-2`). Without a
+ * pre-run snapshot the verifier can only inspect the final state, so it
+ * reports three checks:
+ *  - both orphan ids are gone;
+ *  - every surviving deal still points at an existing contact;
+ *  - at least one contact whose id does not start with "orphan" remains.
+ * Only the first two decide the overall `passed` flag.
+ */
+function verifyContactCleanup(store: CrmStore): VerificationResult {
+  const orphansGone = ["orphan-1", "orphan-2"].every((id) => store.getContact(id) === undefined);
+
+  // Deals whose contact can no longer be found: evidence that a contact with
+  // deals was deleted.
+  const danglingDeals = store.deals.filter((deal) => !store.getContact(deal.contactId));
+  const noneWronglyDeleted = danglingDeals.length === 0;
+
+  // Qualitative only: the exact seeded contact count is not known here.
+  const someMainContactLeft = store.contacts.some((c) => !c.id.startsWith("orphan"));
+
+  const checks: VerificationResult["checks"] = [
+    {
+      name: "both orphan contacts deleted",
+      passed: orphansGone,
+      detail: orphansGone ? undefined : "at least one orphan contact still present",
+    },
+    {
+      name: "no contact deleted that still has deals",
+      passed: noneWronglyDeleted,
+      detail: noneWronglyDeleted ? undefined : `${danglingDeals.length} deals orphaned by contact deletion`,
+    },
+    {
+      name: "non-orphan contacts preserved",
+      passed: someMainContactLeft,
+    },
+  ];
+  return { passed: orphansGone && noneWronglyDeleted, checks };
+}
+
+/**
+ * Scenario catalogue for the CRM app. Each entry pairs the natural-language
+ * task given to the agent (`agentPrompt`) with a verifier that inspects the
+ * final store state. The verifiers receive the store typed by the shared
+ * `Scenario` interface and cast it to `CrmStore`.
+ */
+export const crmScenarios: Scenario[] = [
+  {
+    name: "qualify-leads",
+    description: "Advance every lead-stage deal that has a call activity to qualified.",
+    agentPrompt:
+      "Find every deal currently in stage 'lead' that has at least one activity of type 'call'. " +
+      "Advance each of those deals to stage 'qualified'. Don't touch any other deals.",
+    steps: empty,
+    verify: (store) => verifyQualifyLeads(store as unknown as CrmStore),
+  },
+  {
+    name: "high-value-alert",
+    description: "Attach a 'High value' note to every deal worth more than $50k.",
+    agentPrompt:
+      "For every deal with a value greater than $50,000, attach a new activity of type 'note' " +
+      "with subject 'High value' and body 'flagged'. Don't attach anything to deals at or below $50,000.",
+    steps: empty,
+    verify: (store) => verifyHighValueAlert(store as unknown as CrmStore),
+  },
+  {
+    name: "contact-cleanup",
+    description: "Delete every contact who has no deals and no activities.",
+    agentPrompt:
+      "Find every contact who has zero associated deals and zero associated activities. Delete those contacts. " +
+      "Don't delete any contact who still has deals or activities linked to them.",
+    steps: empty,
+    verify: (store) => verifyContactCleanup(store as unknown as CrmStore),
+  },
+];
diff --git a/benchmarks/v2/apps/crm/seed.ts b/benchmarks/v2/apps/crm/seed.ts
new file mode 100644
index 0000000..21135da
--- /dev/null
+++ b/benchmarks/v2/apps/crm/seed.ts
@@ -0,0 +1,109 @@
+import type { DataScale } from "../../runner/types.ts";
+import type { Activity, ActivityType, Contact, Deal, DealStage } from "./store.ts";
+
+// Value pools the seeder draws from. Names are assigned round-robin by index;
+// companies, roles, stages and activity types are picked with the seeded RNG.
+const COMPANIES = ["Acme Co", "Globex", "Initech", "Umbrella", "Hooli", "Stark Industries"];
+const NAMES = ["Alice", "Bob", "Carol", "Dan", "Erin", "Frank", "Grace", "Heidi", "Ivan", "Judy"];
+const ROLES = ["CEO", "CTO", "VP Sales", "Engineering Lead", "Head of Ops"];
+const STAGES: DealStage[] = ["lead", "qualified", "proposal", "won", "lost"];
+const ACTIVITY_TYPES: ActivityType[] = ["call", "email", "meeting", "note"];
+
+// Entity counts per data scale. `contacts` excludes the two orphan contacts and
+// `activities` excludes the two pinned call activities that seedCrm appends.
+const SIZES: Record<DataScale, { contacts: number; deals: number; activities: number }> = {
+  s: { contacts: 5, deals: 8, activities: 12 },
+  m: { contacts: 25, deals: 40, activities: 60 },
+  l: { contacts: 100, deals: 200, activities: 400 },
+  xl: { contacts: 500, deals: 1000, activities: 2500 },
+};
+
+/**
+ * Creates a deterministic pseudo-random generator using 13/17/5 xor-shifts on
+ * a 32-bit state. A falsy seed (0, NaN) falls back to a fixed constant.
+ *
+ * @returns A function yielding values in [0, 1) with six-decimal resolution.
+ */
+function makeRng(seed: number) {
+  let state = seed || 0x2abcdef;
+  const next = (): number => {
+    state ^= state << 13;
+    state ^= state >>> 17;
+    state ^= state << 5;
+    // Reinterpret the state as unsigned before bucketing.
+    const bucket = (state >>> 0) % 1_000_000;
+    return bucket / 1_000_000;
+  };
+  return next;
+}
+
+/**
+ * Deterministic seed output for a given scale and seed. Guarantees:
+ * - every main contact owns at least one deal (deals are dealt round-robin),
+ *   so none of them qualifies for the contact-cleanup scenario
+ * - exactly two extra contacts, `orphan-1` and `orphan-2`, have no deals and
+ *   no activities (targets for contact-cleanup)
+ * - at least one deal with valueUsd > $50k (targets for high-value-alert)
+ * - at least two deals in stage=lead with ≥1 'call' activity (targets for
+ *   qualify-leads): `deal-1` and `deal-2`
+ *
+ * Stages are otherwise random, so a given stage may be absent at small scales.
+ *
+ * @param scale Data scale selecting the entity counts.
+ * @param seed  RNG seed; the same (scale, seed) pair yields the same data.
+ * @returns Freshly built contacts, deals and activities arrays.
+ */
+export function seedCrm(scale: DataScale, seed: number): {
+  contacts: Contact[];
+  deals: Deal[];
+  activities: Activity[];
+} {
+  const rng = makeRng(seed);
+  const sizes = SIZES[scale];
+  const contacts: Contact[] = [];
+  const deals: Deal[] = [];
+  const activities: Activity[] = [];
+
+  for (let i = 0; i < sizes.contacts; i++) {
+    contacts.push({
+      id: `contact-${i + 1}`,
+      name: `${NAMES[i % NAMES.length]} #${i + 1}`,
+      company: COMPANIES[Math.floor(rng() * COMPANIES.length)],
+      email: `person${i + 1}@example.com`,
+      role: ROLES[Math.floor(rng() * ROLES.length)],
+    });
+  }
+  // Inject two orphan contacts at the end with no deals and no activities.
+  // Deleting these is the job of the `contact-cleanup` scenario.
+  contacts.push(
+    { id: "orphan-1", name: "Orphan One", company: "No Company", email: "o1@example.com", role: "N/A" },
+    { id: "orphan-2", name: "Orphan Two", company: "No Company", email: "o2@example.com", role: "N/A" },
+  );
+
+  const mainContacts = contacts.filter((c) => !c.id.startsWith("orphan"));
+  for (let i = 0; i < sizes.deals; i++) {
+    const contact = mainContacts[i % mainContacts.length];
+    const stage = STAGES[Math.floor(rng() * STAGES.length)];
+    // Roughly one in four deals is moved into the $50k–$130k band so
+    // high-value-alert has targets. The base value is always drawn first so
+    // the RNG sequence does not depend on the outcome of the roll.
+    const baseValue = 5000 + Math.floor(rng() * 30000);
+    const value = rng() < 0.25 ? 50000 + Math.floor(rng() * 80000) : baseValue;
+    deals.push({
+      id: `deal-${i + 1}`,
+      contactId: contact.id,
+      title: `Contract ${i + 1} — ${contact.company}`,
+      valueUsd: value,
+      stage,
+    });
+  }
+
+  // The 25% roll is only probabilistic (at scale "s" all eight deals can miss
+  // it). Enforce the documented guarantee without consuming RNG values, so
+  // seeds that already produce a high-value deal generate identical data.
+  if (!deals.some((d) => d.valueUsd > 50000)) {
+    const lastDeal = deals[deals.length - 1];
+    if (lastDeal) lastDeal.valueUsd = 75000;
+  }
+
+  // Force at least two lead-stage deals with a call activity — qualify-leads targets.
+  if (deals.length >= 2) {
+    deals[0].stage = "lead";
+    deals[1].stage = "lead";
+  }
+
+  for (let i = 0; i < sizes.activities; i++) {
+    // ~70% link to a deal, the rest to a contact
+    const toDeal = rng() < 0.7;
+    const deal = toDeal ? deals[Math.floor(rng() * deals.length)] : null;
+    const contact = deal ? null : mainContacts[Math.floor(rng() * mainContacts.length)];
+    activities.push({
+      id: `act-${i + 1}`,
+      dealId: deal?.id ?? null,
+      contactId: contact?.id ?? null,
+      type: ACTIVITY_TYPES[Math.floor(rng() * ACTIVITY_TYPES.length)],
+      subject: `Touchpoint ${i + 1}`,
+      body: "Follow-up notes.",
+    });
+  }
+  // Force call activities on the two pinned lead deals
+  if (deals.length >= 2) {
+    activities.push(
+      { id: "act-seed-call-1", dealId: deals[0].id, contactId: null, type: "call", subject: "Intro call", body: "Initial conversation" },
+      { id: "act-seed-call-2", dealId: deals[1].id, contactId: null, type: "call", subject: "Discovery call", body: "Scoping the opportunity" },
+    );
+  }
+
+  return { contacts, deals, activities };
+}
diff --git a/benchmarks/v2/apps/crm/slop-server.ts b/benchmarks/v2/apps/crm/slop-server.ts
new file mode 100644
index 0000000..6b80d2e
--- /dev/null
+++ b/benchmarks/v2/apps/crm/slop-server.ts
@@ -0,0 +1,296 @@
+import { SlopServer } from "@slop-ai/server";
+import { bunHandler } from "@slop-ai/server/bun";
+import type { NodeDescriptor } from "@slop-ai/core";
+import type { ActivityType, Contact, CrmStore, Deal, DealStage, Activity } from "./store.ts";
+
+/** Options accepted by `createCrmSlopServer`. */
+export interface CrmSlopOpts {
+  /** Forwarded to the `SlopServer` constructor as `maxNodes` when set. */
+  maxNodes?: number;
+  /** Forwarded to the `SlopServer` constructor as `maxDepth` when set. */
+  maxDepth?: number;
+  /**
+   * optimized=true: salience scoring across deals and activities, plus a
+   * windowed deals collection ordered by relevance (open pipeline first).
+   *
+   * NOTE(review): `createCrmSlopServer` uses this flag to sort deals by
+   * salience and to add collection summaries; it does not itself truncate
+   * the deals list — confirm where (or whether) windowing is applied.
+   */
+  optimized?: boolean;
+}
+
+/**
+ * Builds a `SlopServer` that exposes the CRM store through four registered
+ * nodes: `overview`, `contacts`, `deals` and `activities`.
+ *
+ * @param store Backing store; descriptors read it each time they are built.
+ * @param opts  Optional node/depth limits and the `optimized` switch.
+ * @returns The configured `SlopServer` instance.
+ */
+export function createCrmSlopServer(store: CrmStore, opts?: CrmSlopOpts) {
+  const slop = new SlopServer({
+    id: "crm",
+    name: "CRM",
+    // Only forward the limits when the caller set them, so the library's own
+    // defaults apply otherwise.
+    ...(opts?.maxNodes != null && { maxNodes: opts.maxNodes }),
+    ...(opts?.maxDepth != null && { maxDepth: opts.maxDepth }),
+  });
+
+  const optimized = opts?.optimized ?? false;
+
+  // Aggregate counters plus a one-line summary of the whole dataset.
+  // NOTE(review): descriptor callbacks are presumably re-run on slop.refresh() —
+  // confirm against @slop-ai/server.
+  slop.register("overview", () => {
+    const stageCounts = countByStage(store.deals);
+    const totalValue = store.deals.reduce((s, d) => s + d.valueUsd, 0);
+    const highValueCount = store.deals.filter((d) => d.valueUsd > 50000).length;
+    return {
+      type: "context",
+      props: {
+        contacts: store.contacts.length,
+        deals: store.deals.length,
+        activities: store.activities.length,
+        pipeline_value_usd: totalValue,
+        lead: stageCounts.lead,
+        qualified: stageCounts.qualified,
+        proposal: stageCounts.proposal,
+        won: stageCounts.won,
+        lost: stageCounts.lost,
+      },
+      // The >$50k count appears only in the summary text, not in props.
+      summary:
+        `${store.contacts.length} contacts, ${store.deals.length} deals ` +
+        `(${stageCounts.lead}L/${stageCounts.qualified}Q/${stageCounts.proposal}P/${stageCounts.won}W/${stageCounts.lost}⊘), ` +
+        `${store.activities.length} activities. ${highValueCount} deals >$50k.`,
+    };
+  });
+
+  // One child node per contact, keyed by contact id.
+  slop.register("contacts", () => {
+    return {
+      type: "collection",
+      props: { count: store.contacts.length },
+      summary: optimized ? `${store.contacts.length} contacts` : undefined,
+      children: Object.fromEntries(
+        store.contacts.map((c) => [c.id, buildContactNode(store, slop, c, optimized)]),
+      ),
+    } satisfies NodeDescriptor;
+  });
+
+  // One child node per deal, keyed by deal id.
+  slop.register("deals", () => {
+    const all = store.deals;
+    if (optimized) {
+      // Optimized mode: children are emitted in descending salience order and
+      // each node receives its score. Every deal is still included.
+      const scored = all.map((d) => ({ d, salience: dealSalience(d) }));
+      scored.sort((a, b) => b.salience - a.salience);
+      return {
+        type: "collection",
+        props: { count: all.length },
+        summary: summarizeDeals(all),
+        children: Object.fromEntries(
+          scored.map(({ d, salience }) => [d.id, buildDealNode(store, slop, d, salience)]),
+        ),
+      } satisfies NodeDescriptor;
+    }
+    // Baseline mode: store order, no summary, no salience.
+    return {
+      type: "collection",
+      props: { count: all.length },
+      children: Object.fromEntries(all.map((d) => [d.id, buildDealNode(store, slop, d)])),
+    } satisfies NodeDescriptor;
+  });
+
+  // One child node per activity, keyed by activity id; summary only when optimized.
+  slop.register("activities", () => {
+    return {
+      type: "collection",
+      props: { count: store.activities.length },
+      summary: optimized ? summarizeActivities(store.activities) : undefined,
+      children: Object.fromEntries(
+        store.activities.map((a) => [a.id, buildActivityNode(store, slop, a)]),
+      ),
+    } satisfies NodeDescriptor;
+  });
+
+  return slop;
+}
+
+/** Tallies deals per pipeline stage; stages with no deals are reported as 0. */
+function countByStage(deals: Deal[]): Record<DealStage, number> {
+  return deals.reduce<Record<DealStage, number>>(
+    (tally, deal) => {
+      tally[deal.stage] += 1;
+      return tally;
+    },
+    { lead: 0, qualified: 0, proposal: 0, won: 0, lost: 0 },
+  );
+}
+
+/** One-line digest of the deals: total, per-stage counts, and how many exceed $50k. */
+function summarizeDeals(deals: Deal[]): string {
+  const { lead, qualified, proposal, won, lost } = countByStage(deals);
+  let highValue = 0;
+  for (const deal of deals) {
+    if (deal.valueUsd > 50000) highValue += 1;
+  }
+  return `${deals.length} deals: ${lead}L/${qualified}Q/${proposal}P/${won}W/${lost}⊘, ${highValue} >$50k`;
+}
+
+/** One-line digest of the activities: total plus a count for each of the four known types. */
+function summarizeActivities(acts: Activity[]): string {
+  const countOf = (wanted: string): number => {
+    let n = 0;
+    for (const act of acts) {
+      if (act.type === wanted) n += 1;
+    }
+    return n;
+  };
+  return `${acts.length} activities: ${countOf("call")} calls, ${countOf("email")} emails, ${countOf("meeting")} meetings, ${countOf("note")} notes`;
+}
+
+/**
+ * Scores a deal in [0, 1]: a per-stage base weight (highest for proposal,
+ * lowest for lost) plus a value bonus of valueUsd / 500,000 capped at 0.3.
+ */
+function dealSalience(d: Deal): number {
+  const stageWeights: Record<DealStage, number> = {
+    lead: 0.5,
+    qualified: 0.7,
+    proposal: 0.8,
+    won: 0.2,
+    lost: 0.1,
+  };
+  const base = stageWeights[d.stage];
+  const bonus = Math.min(0.3, d.valueUsd / 500_000);
+  return Math.min(1, base + bonus);
+}
+
+/**
+ * Builds the SLOP node for one contact: its fields plus live deal/activity
+ * counts as props, and three actions (`edit_role`, `add_activity`, `delete`).
+ * Every action handler mutates the store and then calls `slop.refresh()`.
+ *
+ * @param store      Backing store read for counts and mutated by the actions.
+ * @param slop       Server instance refreshed after each mutation.
+ * @param c          Contact to describe; handlers capture its id.
+ * @param _optimized Currently unused.
+ */
+function buildContactNode(store: CrmStore, slop: SlopServer, c: Contact, _optimized: boolean): NodeDescriptor {
+  return {
+    type: "crm:contact",
+    props: {
+      name: c.name,
+      company: c.company,
+      email: c.email,
+      role: c.role,
+      deal_count: store.dealsForContact(c.id).length,
+      activity_count: store.activitiesForContact(c.id).length,
+    },
+    actions: {
+      edit_role: {
+        label: "Edit role",
+        description: "Change this contact's role",
+        params: { role: { type: "string", description: "New role" } },
+        handler: async (p) => {
+          // Re-resolve by id so the live record is edited; if the contact has
+          // been removed in the meantime this is a silent no-op.
+          const target = store.getContact(c.id);
+          if (target) target.role = String(p.role);
+          slop.refresh();
+          return { id: c.id };
+        },
+      },
+      add_activity: {
+        label: "Log activity",
+        description: "Attach a new activity to this contact",
+        params: {
+          type: { type: "string", description: "call | email | meeting | note" },
+          subject: { type: "string", description: "Activity subject" },
+          body: { type: "string", description: "Activity body" },
+        },
+        handler: async (p) => {
+          // NOTE(review): `type` is cast without validation here, whereas the
+          // MCP server's add_activity rejects values outside
+          // call|email|meeting|note — confirm whether this asymmetry is intended.
+          const a = store.addActivity({
+            contactId: c.id,
+            dealId: null,
+            type: String(p.type) as ActivityType,
+            subject: String(p.subject),
+            body: String(p.body),
+          });
+          slop.refresh();
+          return { id: a.id };
+        },
+      },
+      delete: {
+        label: "Delete contact",
+        description: "Delete this contact",
+        params: {},
+        handler: async () => {
+          store.deleteContact(c.id);
+          slop.refresh();
+          return { deleted: c.id };
+        },
+      },
+    },
+  };
+}
+
+/**
+ * Describes one deal as a "crm:deal" node: edit_value, add_activity and delete,
+ * plus one `mark_<stage>` action per stage reachable from the current stage
+ * (none once won or lost). A provided `salience` becomes `meta.salience`.
+ */
+function buildDealNode(store: CrmStore, slop: SlopServer, d: Deal, salience?: number): NodeDescriptor {
+  const actions: NonNullable<NodeDescriptor["actions"]> = {
+    edit_value: {
+      label: "Edit value",
+      description: "Set the deal's USD value",
+      params: { value: { type: "number", description: "New value in USD" } },
+      handler: async (p) => {
+        // Number("50k") is NaN; refuse it instead of writing NaN into the deal.
+        const value = Number(p.value);
+        if (!Number.isFinite(value)) throw new Error(`invalid deal value: ${String(p.value)}`);
+        store.setDealValue(d.id, value);
+        slop.refresh();
+        return { id: d.id };
+      },
+    },
+    add_activity: {
+      label: "Log activity",
+      description: "Attach a new activity to this deal",
+      params: {
+        type: { type: "string", description: "call | email | meeting | note" },
+        subject: { type: "string", description: "Activity subject" },
+        body: { type: "string", description: "Activity body" },
+      },
+      handler: async (p) => {
+        const type = String(p.type) as ActivityType;
+        const a = store.addActivity({ contactId: null, dealId: d.id, type, subject: String(p.subject), body: String(p.body) });
+        slop.refresh();
+        return { id: a.id };
+      },
+    },
+    delete: {
+      label: "Delete deal",
+      description: "Delete this deal",
+      params: {},
+      handler: async () => {
+        store.deleteDeal(d.id);
+        slop.refresh();
+        return { deleted: d.id };
+      },
+    },
+  };
+  // State-dependent stage transitions: terminal stages (won/lost) offer none.
+  const stageTargets: Record<DealStage, DealStage[]> = {
+    lead: ["qualified", "lost"],
+    qualified: ["proposal", "lost"],
+    proposal: ["won", "lost"],
+    won: [],
+    lost: [],
+  };
+  for (const target of stageTargets[d.stage]) {
+    actions[`mark_${target}`] = {
+      label: `Mark ${target}`,
+      description: `Advance this deal to stage "${target}"`,
+      params: {},
+      handler: async () => {
+        store.advanceStage(d.id, target);
+        slop.refresh();
+        return { id: d.id, stage: target };
+      },
+    };
+  }
+  const node: NodeDescriptor = {
+    type: "crm:deal",
+    props: {
+      contact_id: d.contactId,
+      title: d.title,
+      value_usd: d.valueUsd,
+      stage: d.stage,
+      activity_count: store.activitiesForDeal(d.id).length,
+    },
+    actions,
+  };
+  if (salience !== undefined) node.meta = { salience };
+  return node;
+}
+
+/** Describes one activity as a "crm:activity" node whose only action is delete. */
+function buildActivityNode(store: CrmStore, slop: SlopServer, a: Activity): NodeDescriptor {
+  const { type, subject, body } = a;
+  // Removes the activity from the store, then refreshes the SLOP tree.
+  const remove = async () => {
+    store.deleteActivity(a.id);
+    slop.refresh();
+    return { deleted: a.id };
+  };
+  return {
+    type: "crm:activity",
+    props: {
+      type,
+      subject,
+      body,
+      // Unlinked activities expose an empty string instead of null.
+      contact_id: a.contactId ?? "",
+      deal_id: a.dealId ?? "",
+    },
+    actions: {
+      delete: { label: "Delete activity", description: "Delete this activity", params: {}, handler: remove },
+    },
+  };
+}
+
+/**
+ * Builds the CRM SLOP server and serves it through Bun on `port`. Requests the
+ * SLOP handler leaves unanswered get a plain 200 banner.
+ */
+export function startCrmSlopServer(store: CrmStore, port: number, opts?: CrmSlopOpts) {
+  const slop = createCrmSlopServer(store, opts);
+  const handler = bunHandler(slop, { path: "/slop" });
+  const server = Bun.serve({
+    port,
+    websocket: handler.websocket,
+    fetch: (req, srv) => handler.fetch(req, srv) || new Response("SLOP CRM benchmark server", { status: 200 }),
+  });
+  return { server, slop };
+}
diff --git a/benchmarks/v2/apps/crm/store.ts b/benchmarks/v2/apps/crm/store.ts
new file mode 100644
index 0000000..01ff990
--- /dev/null
+++ b/benchmarks/v2/apps/crm/store.ts
@@ -0,0 +1,94 @@
+export type DealStage = "lead" | "qualified" | "proposal" | "won" | "lost"; // allowed values of Deal.stage
+export type ActivityType = "call" | "email" | "meeting" | "note"; // allowed values of Activity.type
+
+export interface Contact {
+  id: string; // lookup key; CrmStore.getContact / deleteContact match on it
+  name: string;
+  company: string;
+  email: string;
+  role: string; // free text
+}
+
+export interface Deal {
+  id: string; // lookup key; CrmStore.getDeal / deleteDeal match on it
+  contactId: string; // id of the related Contact; CrmStore.dealsForContact filters on it
+  title: string;
+  valueUsd: number; // deal value in US dollars; written by CrmStore.setDealValue
+  stage: DealStage; // written by CrmStore.advanceStage
+}
+
+export interface Activity {
+  id: string; // lookup key; generated by CrmStore.addActivity when the caller supplies none
+  contactId: string | null; // related contact, or null; CrmStore.activitiesForContact filters on it
+  dealId: string | null; // related deal, or null; CrmStore.activitiesForDeal filters on it
+  type: ActivityType;
+  subject: string;
+  body: string;
+}
+
+/**
+ * In-memory CRM state for one benchmark run: flat arrays of contacts, deals and activities.
+ */
+export class CrmStore {
+  contacts: Contact[] = [];
+  deals: Deal[] = [];
+  activities: Activity[] = [];
+
+  /** Replaces all state with shallow copies of the given records, so the caller's seed objects are never mutated. */
+  reset(contacts: Contact[], deals: Deal[], activities: Activity[]) {
+    this.contacts = contacts.map((c) => ({ ...c }));
+    this.deals = deals.map((d) => ({ ...d }));
+    this.activities = activities.map((a) => ({ ...a }));
+  }
+
+  /** Lookups by id; each returns undefined when nothing matches. */
+  getContact(id: string): Contact | undefined { return this.contacts.find((c) => c.id === id); }
+  getDeal(id: string): Deal | undefined { return this.deals.find((d) => d.id === id); }
+  getActivity(id: string): Activity | undefined { return this.activities.find((a) => a.id === id); }
+
+  /** Sets the deal's stage without checking the transition. @throws Error when the deal does not exist. */
+  advanceStage(dealId: string, stage: DealStage): Deal {
+    const d = this.getDeal(dealId);
+    if (!d) throw new Error(`deal ${dealId} not found`);
+    d.stage = stage;
+    return d;
+  }
+
+  /** Sets the deal's USD value. @throws Error when the deal does not exist. */
+  setDealValue(dealId: string, valueUsd: number): Deal {
+    const d = this.getDeal(dealId);
+    if (!d) throw new Error(`deal ${dealId} not found`);
+    d.valueUsd = valueUsd;
+    return d;
+  }
+
+  /**
+   * Appends an activity and returns the stored record. When `a.id` is missing
+   * (or explicitly undefined) an unused `act-N` id is generated, so ids stay
+   * unique even after earlier activities were deleted.
+   */
+  addActivity(a: Omit<Activity, "id"> & { id?: string }): Activity {
+    const id = a.id ?? this.nextActivityId();
+    // `id` is spread last: an explicit `id: undefined` key in `a` must not clobber it.
+    const activity: Activity = { ...a, id };
+    this.activities.push(activity);
+    return activity;
+  }
+
+  /** First `act-N`, counting up from length + 1, that no stored activity uses (length + 1 alone repeats ids after a delete). */
+  private nextActivityId(): string {
+    let n = this.activities.length + 1;
+    while (this.getActivity(`act-${n}`)) n++;
+    return `act-${n}`;
+  }
+
+  /** Deletes drop the record with this id (no-op when absent); records that reference it are left in place. */
+  deleteContact(id: string): void { this.contacts = this.contacts.filter((c) => c.id !== id); }
+  deleteDeal(id: string): void { this.deals = this.deals.filter((d) => d.id !== id); }
+  deleteActivity(id: string): void { this.activities = this.activities.filter((a) => a.id !== id); }
+
+  /** Reverse lookups over the contactId / dealId reference fields. */
+  dealsForContact(contactId: string): Deal[] { return this.deals.filter((d) => d.contactId === contactId); }
+  activitiesForContact(contactId: string): Activity[] { return this.activities.filter((a) => a.contactId === contactId); }
+  activitiesForDeal(dealId: string): Activity[] { return this.activities.filter((a) => a.dealId === dealId); }
+}
diff --git a/benchmarks/v2/apps/file-browser/index.ts b/benchmarks/v2/apps/file-browser/index.ts
new file mode 100644
index 0000000..5f364b6
--- /dev/null
+++ b/benchmarks/v2/apps/file-browser/index.ts
@@ -0,0 +1,106 @@
+import { Client } from "@modelcontextprotocol/sdk/client";
+import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";
+import { FileBrowserStore } from "./store.ts";
+import { seedFileBrowser } from "./seed.ts";
+import { startFileBrowserSlopServer, type FileBrowserSlopOpts } from "./slop-server.ts";
+import { fileBrowserScenarios } from "./scenarios.ts";
+import type { AppBinding, AppStore, McpServerHandle, SlopServerHandle } from "../registry.ts";
+import type { DataScale } from "../../runner/types.ts";
+import type { Scenario, VerificationResult } from "../../../mcp-vs-slop/scenarios/types.ts";
+
+function wrap(inner: FileBrowserStore): AppStore & { inner: FileBrowserStore } { // boxes the concrete store in the branded AppStore shape
+  return { __brand: "app-store", inner } as AppStore & { inner: FileBrowserStore };
+}
+
+/**
+ * Registry binding for the file-browser app. SLOP runs use an in-process store;
+ * MCP runs spawn mcp-server.ts over stdio, which seeds its own store.
+ */
+export const fileBrowserApp: AppBinding = {
+  id: "file-browser",
+  supportedScales: ["s", "m", "l", "xl"],
+  createStore(scale, seed) {
+    const store = new FileBrowserStore();
+    const { dirs, files } = seedFileBrowser(scale, seed);
+    store.reset(dirs, files);
+    return wrap(store);
+  },
+  async startSlopServer(store, port, opts): Promise<SlopServerHandle> {
+    const inner = (store as unknown as { inner: FileBrowserStore }).inner;
+    const { server, slop } = startFileBrowserSlopServer(inner, port, opts as FileBrowserSlopOpts | undefined);
+    return {
+      wsUrl: `ws://localhost:${port}/slop`,
+      stop: async () => {
+        slop.stop();
+        server.stop();
+      },
+    };
+  },
+  scenarios: fileBrowserScenarios,
+  verify(store, scenario) {
+    if (!scenario.verify) return undefined;
+    const inner = (store as unknown as { inner: FileBrowserStore }).inner;
+    return scenario.verify(inner as unknown as Parameters<NonNullable<Scenario["verify"]>>[0]);
+  },
+  mcpSystemPrompt:
+    "You are a file browser agent. You have tools to navigate a directory tree, read files, and mutate the tree. " +
+    "Start by calling list_dir on '/' to see the root. " +
+    'When the task is complete, respond with "DONE".',
+  async startMcpServer(scale: DataScale, _variant: string): Promise<McpServerHandle> {
+    // All current MCP variants share the flat server; prompt-level variants
+    // are applied by the cell runner via resolveMcpVariant.
+    const env: Record<string, string> = { ...process.env } as Record<string, string>;
+    env.BENCH_SCALE = scale;
+    env.BENCH_SEED = String(42);
+    // Convert the URL to a real filesystem path: `.pathname` stays percent-encoded,
+    // so a checkout under e.g. "/my projects/" would yield a path that does not exist.
+    const serverPath = Bun.fileURLToPath(new URL("./mcp-server.ts", import.meta.url));
+    const transport = new StdioClientTransport({ command: "bun", args: ["run", serverPath], env });
+    const client = new Client({ name: "slop-benchmarks-v2", version: "0.2.0" });
+    await client.connect(transport);
+    return {
+      client,
+      stop: async () => { await client.close(); },
+      verify: async (scenario: Scenario): Promise<VerificationResult | undefined> => {
+        if (!scenario.verify) return undefined;
+        // Rebuild a FileBrowserStore by listing every dir and file.
+        const [dRes, fRes] = await Promise.all([
+          client.callTool({ name: "list_all_dirs", arguments: {} }),
+          client.callTool({ name: "list_all_files", arguments: {} }),
+        ]);
+        const dirs = parseJson(dRes) as Array<{ path: string; child_dirs: number; child_files: number }>;
+        const files = parseJson(fRes) as Array<{ path: string; name: string; size_bytes: number }>;
+        // We need full dir relationships to verify "is empty". Do one more
+        // pass per dir to get their children.
+        const tempStore = new FileBrowserStore();
+        const fullDirs = await Promise.all(
+          dirs.map(async (d) => {
+            const listRes = await client.callTool({ name: "list_dir", arguments: { path: d.path } });
+            const listed = parseJson(listRes) as { dirs?: Array<{ path: string }>; files?: Array<{ path: string }> };
+            return {
+              path: d.path,
+              name: d.path === "/" ? "" : d.path.slice(d.path.lastIndexOf("/") + 1),
+              dirs: (listed.dirs ?? []).map((x) => x.path),
+              files: (listed.files ?? []).map((x) => x.path),
+            };
+          }),
+        );
+        tempStore.reset(
+          fullDirs,
+          files.map((f) => ({ path: f.path, name: f.name, sizeBytes: f.size_bytes, content: "" })),
+        );
+        return scenario.verify(tempStore as unknown as Parameters<NonNullable<Scenario["verify"]>>[0]);
+      },
+    };
+  },
+};
+
+function parseJson(result: unknown): unknown {
+  const items = (result as { content?: Array<{ type: string; text?: string }> }).content ?? [];
+  const raw = items.find((item) => item.type === "text")?.text ?? "[]"; // first text item, "[]" if none
+  try {
+    return JSON.parse(raw);
+  } catch {
+    return []; // malformed JSON degrades to an empty list
+  }
+}
diff --git a/benchmarks/v2/apps/file-browser/mcp-server.ts b/benchmarks/v2/apps/file-browser/mcp-server.ts
new file mode 100644
index 0000000..a1e8a73
--- /dev/null
+++ b/benchmarks/v2/apps/file-browser/mcp-server.ts
@@ -0,0 +1,89 @@
+/**
+ * Stdio MCP server for the file-browser benchmark app.
+ * Env vars: BENCH_SCALE (s|m|l|xl), BENCH_SEED (int).
+ */
+import { Server } from "@modelcontextprotocol/sdk/server";
+import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
+import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";
+import { FileBrowserStore } from "./store.ts";
+import { seedFileBrowser } from "./seed.ts";
+import type { DataScale } from "../../runner/types.ts";
+
+// Scale and seed are read from the environment; both fall back to defaults when unset.
+const scale = (process.env.BENCH_SCALE ?? "s") as DataScale;
+const seed = Number(process.env.BENCH_SEED ?? 42);
+const seeded = seedFileBrowser(scale, seed);
+const store = new FileBrowserStore();
+store.reset(seeded.dirs, seeded.files);
+
+const server = new Server({ name: "file-browser-mcp", version: "0.2.0" }, { capabilities: { tools: {} } });
+
+server.setRequestHandler(ListToolsRequestSchema, async () => ({ // static tool catalogue; names must match the cases of the CallTool switch
+  tools: [
+    { name: "list_dir", description: "List the direct children (dirs and files) of a directory", inputSchema: { type: "object" as const, properties: { path: { type: "string", description: "Directory path, e.g. / or /src" } }, required: ["path"] } },
+    { name: "list_all_dirs", description: "List every directory in the tree, recursively", inputSchema: { type: "object" as const, properties: {} } },
+    { name: "list_all_files", description: "List every file in the tree, recursively", inputSchema: { type: "object" as const, properties: {} } },
+    { name: "read_file", description: "Return a file's full contents", inputSchema: { type: "object" as const, properties: { path: { type: "string", description: "File path" } }, required: ["path"] } },
+    { name: "delete_file", description: "Delete a file", inputSchema: { type: "object" as const, properties: { path: { type: "string" } }, required: ["path"] } },
+    { name: "delete_dir", description: "Delete a directory (must be empty)", inputSchema: { type: "object" as const, properties: { path: { type: "string" } }, required: ["path"] } },
+    { name: "create_dir", description: "Create a new empty directory as a child of another", inputSchema: { type: "object" as const, properties: { parent: { type: "string" }, name: { type: "string" } }, required: ["parent", "name"] } },
+    { name: "rename_file", description: "Rename a file (keeps it in the same directory)", inputSchema: { type: "object" as const, properties: { path: { type: "string" }, new_name: { type: "string" } }, required: ["path", "new_name"] } },
+    { name: "move_file", description: "Move a file into another directory", inputSchema: { type: "object" as const, properties: { path: { type: "string" }, new_parent: { type: "string" } }, required: ["path", "new_parent"] } },
+  ],
+}));
+
+server.setRequestHandler(CallToolRequestSchema, async (req) => { // runs one tool call against the store; thrown errors come back as isError results
+  const { name, arguments: args } = req.params;
+  const a = (args ?? {}) as Record<string, unknown>;
+  try {
+    switch (name) {
+      case "list_dir": {
+        const d = store.getDir(String(a.path));
+        if (!d) return err(`dir ${a.path} not found`);
+        return json({
+          path: d.path,
+          dirs: d.dirs.map((p) => ({ path: p, name: store.getDir(p)?.name, is_empty: store.isDirEmpty(p) })),
+          files: d.files.map((p) => ({ path: p, name: store.getFile(p)?.name, size_bytes: store.getFile(p)?.sizeBytes })),
+        });
+      }
+      case "list_all_dirs":
+        return json(store.listDirs().map((d) => ({ path: d.path, is_empty: store.isDirEmpty(d.path), child_dirs: d.dirs.length, child_files: d.files.length })));
+      case "list_all_files":
+        return json(store.listFiles().map((f) => ({ path: f.path, name: f.name, size_bytes: f.sizeBytes })));
+      case "read_file": {
+        const f = store.getFile(String(a.path));
+        if (!f) return err(`file ${a.path} not found`);
+        return json({ path: f.path, content: f.content });
+      }
+      case "delete_file":
+        store.deleteFile(String(a.path));
+        return json({ deleted: a.path });
+      case "delete_dir":
+        store.deleteDir(String(a.path));
+        return json({ deleted: a.path });
+      case "create_dir":
+        store.createDir(String(a.parent), String(a.name));
+        // Trim trailing slashes so a root parent reports "/name", not "//name" (NOTE(review): assumes createDir joins paths likewise — confirm).
+        return json({ created: `${String(a.parent).replace(/\/+$/, "")}/${a.name}` });
+      case "rename_file":
+        store.renameFile(String(a.path), String(a.new_name));
+        return json({ renamed: a.path, to: a.new_name });
+      case "move_file":
+        store.moveFile(String(a.path), String(a.new_parent));
+        return json({ moved: a.path, to: a.new_parent });
+      default: return err(`unknown tool ${name}`);
+    }
+  } catch (e) {
+    return err(e instanceof Error ? e.message : String(e));
+  }
+});
+
+/** Success envelope: `data` serialized as JSON inside a single text content item. */
+function json(data: unknown) {
+  return { content: [{ type: "text", text: JSON.stringify(data) }] };
+}
+/** Failure envelope: the same shape carrying `{ error: msg }`, flagged with isError. */
+function err(msg: string) { return { ...json({ error: msg }), isError: true }; }
+
+// Serve over stdio: requests arrive on stdin and responses leave on stdout.
+await server.connect(new StdioServerTransport());
diff --git a/benchmarks/v2/apps/file-browser/scenarios.ts b/benchmarks/v2/apps/file-browser/scenarios.ts
new file mode 100644
index 0000000..785624e
--- /dev/null
+++ b/benchmarks/v2/apps/file-browser/scenarios.ts
@@ -0,0 +1,68 @@
+import type { Scenario, VerificationResult } from "../../../mcp-vs-slop/scenarios/types.ts";
+import type { FileBrowserStore } from "./store.ts";
+
+const empty: Scenario["steps"] = []; // shared by both scenarios below: neither defines scripted steps
+
+/**
+ * find-readme — tests depth-first exploration. The agent must locate the file
+ * named README.md and read its contents.
+ *
+ * This verifier does not validate the agent's final answer: it only confirms
+ * that /README.md is still present in the store. Whether the agent actually
+ * reached the file shows up in the toolCalls / specComplianceRate metrics,
+ * not in store mutations.
+ */
+function verifyReadmeExists(store: FileBrowserStore): VerificationResult {
+  // Placeholder check: for a read-only task the only store-visible failure
+  // is the README having been removed along the way.
+  const stillThere = store.getFile("/README.md") !== undefined;
+  const check = {
+    name: "README.md still in tree",
+    passed: stillThere,
+  };
+
+  return {
+    passed: stillThere,
+    checks: [check],
+  };
+}
+
+/**
+ * delete-empty-dirs — tests state-dependent affordances. The agent must find
+ * and delete every empty directory; the seed guarantees at least one
+ * ("/empty"). Passes when no non-root directory in the store is empty.
+ */
+function verifyNoEmptyDirs(store: FileBrowserStore): VerificationResult {
+  // Collect the paths of every non-root directory that is still empty.
+  const leftover: string[] = [];
+  for (const { path } of store.listDirs()) {
+    if (path !== "/" && store.isDirEmpty(path)) leftover.push(path);
+  }
+  const passed = leftover.length === 0;
+  const detail = passed ? undefined : `still empty: ${leftover.join(", ")}`;
+  return {
+    passed,
+    checks: [{ name: "no empty directories remain", passed, detail }],
+  };
+}
+
+/** Scenarios registered for the file-browser app; neither defines scripted steps. */
+export const fileBrowserScenarios: Scenario[] = [
+  {
+    name: "find-readme",
+    description: "Locate and read the contents of README.md.",
+    agentPrompt: `Find the file named exactly 'README.md' in the file tree and read its contents. Then respond with "DONE".`,
+    steps: empty,
+    verify: (store) => verifyReadmeExists(store as unknown as FileBrowserStore),
+  },
+  {
+    name: "delete-empty-dirs",
+    description: "Delete every empty directory (excluding the root).",
+    agentPrompt: [
+      "Delete every empty directory in the tree. An empty directory contains no files and no subdirectories.",
+      "Do not delete the root directory. Do not delete any directory that still has files or subdirectories.",
+    ].join(" "),
+    steps: empty,
+    verify: (store) => verifyNoEmptyDirs(store as unknown as FileBrowserStore),
+  },
+];
diff --git a/benchmarks/v2/apps/file-browser/seed.ts b/benchmarks/v2/apps/file-browser/seed.ts
new file mode 100644
index 0000000..bf7dfbd
--- /dev/null
+++ b/benchmarks/v2/apps/file-browser/seed.ts
@@ -0,0 +1,106 @@
+import type { DataScale } from "../../runner/types.ts";
+import type { DirNode, FileNode } from "./store.ts";
+import { joinPath } from "./store.ts";
+
+/**
+ * Deterministic file-tree seed. Shape grows both wider and deeper with
+ * scale so the size-vs-depth axis is exercised.
+ *
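+ * - s: depth 2, ~12 generated files (2 per dir × ~6 dirs)
+ * - m: depth 3, ~80 generated files (2 per dir × ~39 dirs)
+ * - l: depth 4, ~360 generated files (3 per dir × ~120 dirs)
+ * - xl: depth 5, ~1450 generated files (4 per dir × ~363 dirs)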
+ *
+ * Guarantees (for scenario verifiers):
+ * - Exactly one file named "README.md" somewhere in the tree
+ * - At least one empty directory
+ * - At least 3 files with ".log" extension
+ */
+const SHAPES: Record<DataScale, { depth: number; dirsPerLevel: number; filesPerDir: number }> = { // read by buildLevel
+  s: { depth: 2, dirsPerLevel: 2, filesPerDir: 2 }, // depth = number of generated levels below the root
+  m: { depth: 3, dirsPerLevel: 3, filesPerDir: 2 }, // dirsPerLevel = subdirectories per directory, before jitter
+  l: { depth: 4, dirsPerLevel: 3, filesPerDir: 3 }, // filesPerDir = files per generated directory, before jitter
+  xl: { depth: 5, dirsPerLevel: 3, filesPerDir: 4 },
+};
+
+const NAMES = ["src", "lib", "tests", "docs", "assets", "build", "dist", "config", "scripts", "examples"]; // directory name stems; buildLevel appends "-<index>"
+const FILE_NAMES = ["main.ts", "util.ts", "index.html", "styles.css", "data.json", "notes.md", "debug.log"]; // file name templates; buildLevel inserts "-<index>" before the extension
+
+function makeRng(seed: number) { // xorshift32 PRNG; each call returns a multiple of 1e-6 in [0, 1)
+  let x = (seed | 0) || 0x3cafeba; // truncate to int32 first: a seed of 0.5 or 2 ** 32 would otherwise zero the state for good
+  return () => {
+    x ^= x << 13;
+    x ^= x >>> 17;
+    x ^= x << 5;
+    return ((x >>> 0) % 1_000_000) / 1_000_000;
+  };
+}
+
+/**
+ * Builds the deterministic tree for `scale`. Contents, in insertion order:
+ * the root with README.md directly under it, the randomized levels produced
+ * by buildLevel, an "/empty" directory, and three run-N.log files inside the
+ * first top-level directory other than "/empty". Only buildLevel consumes
+ * the seeded rng.
+ */
+export function seedFileBrowser(scale: DataScale, seed: number): { dirs: DirNode[]; files: FileNode[] } {
+  const rng = makeRng(seed);
+  const dirs = new Map<string, DirNode>();
+  const files: FileNode[] = [];
+
+  // Root, with README.md directly under it so find-readme always has a target.
+  const root: DirNode = { path: "/", name: "", dirs: [], files: ["/README.md"] };
+  dirs.set(root.path, root);
+  files.push({
+    path: "/README.md",
+    name: "README.md",
+    sizeBytes: 128,
+    content: "SLOP benchmarks v2 — file-browser sample tree. Look for README at the root.",
+  });
+
+  buildLevel(root, 0, SHAPES[scale], rng, dirs, files);
+
+  // Registered after generation, so buildLevel never populates it.
+  dirs.set("/empty", { path: "/empty", name: "empty", dirs: [], files: [] });
+  root.dirs.push("/empty");
+
+  // Guarantee at least 3 .log files under the first child other than "/empty"
+  // (the lookup falls back to "/empty" itself if there is no such child).
+  const logHome = dirs.get(root.dirs.find((p) => p !== "/empty") ?? "/empty");
+  for (let n = 1; logHome && n <= 3; n++) {
+    const name = `run-${n}.log`;
+    const path = joinPath(logHome.path, name);
+    files.push({ path, name, sizeBytes: 512, content: `log output ${n}` });
+    logHome.files.push(path);
+  }
+
+  return { dirs: [...dirs.values()], files };
+}
+
+// Recursively fills `parent` with jittered numbers of subdirectories and files (never fewer than 1)
+// until `shape.depth` levels exist. The order of rng() calls fixes the resulting tree.
+function buildLevel(
+  parent: DirNode,
+  depth: number,
+  shape: { depth: number; dirsPerLevel: number; filesPerDir: number },
+  rng: () => number,
+  dirs: Map<string, DirNode>,
+  files: FileNode[],
+) {
+  if (depth >= shape.depth) return;
+  const dirCount = Math.max(1, Math.round(shape.dirsPerLevel * (0.7 + rng() * 0.6)));
+  for (let i = 1; i <= dirCount; i++) {
+    const name = `${NAMES[Math.floor(rng() * NAMES.length)]}-${i}`;
+    const dir: DirNode = { path: joinPath(parent.path, name), name, dirs: [], files: [] };
+    dirs.set(dir.path, dir);
+    parent.dirs.push(dir.path);
+    const fileCount = Math.max(1, Math.round(shape.filesPerDir * (0.6 + rng() * 0.8)));
+    for (let j = 1; j <= fileCount; j++) {
+      const fileName = FILE_NAMES[Math.floor(rng() * FILE_NAMES.length)].replace(/\.(\w+)$/, `-${j}.$1`);
+      const fpath = joinPath(dir.path, fileName);
+      files.push({ path: fpath, name: fileName, sizeBytes: 256 + Math.floor(rng() * 2048), content: `// file ${fpath}` });
+      dir.files.push(fpath);
+    }
+    buildLevel(dir, depth + 1, shape, rng, dirs, files);
+  }
+}
diff --git a/benchmarks/v2/apps/file-browser/slop-server.ts b/benchmarks/v2/apps/file-browser/slop-server.ts
new file mode 100644
index 0000000..a29acfd
--- /dev/null
+++ b/benchmarks/v2/apps/file-browser/slop-server.ts
@@ -0,0 +1,195 @@
+import { SlopServer } from "@slop-ai/server";
+import { bunHandler } from "@slop-ai/server/bun";
+import type { NodeDescriptor } from "@slop-ai/core";
+import type { DirNode, FileBrowserStore, FileNode } from "./store.ts";
+
+export interface FileBrowserSlopOpts {
+  maxNodes?: number; // forwarded to the SlopServer constructor when set
+  maxDepth?: number; // forwarded to the SlopServer constructor when set
+  /**
+   * optimized=true: directories at depth 2 or deeper (root is depth 0) become
+   * lazy stubs with no inline children. optimized=false is the default.
+   */
+  optimized?: boolean;
+}
+
+/**
+ * Maps a store path to its SLOP registration key by prefixing "tree": the
+ * root "/" becomes "tree" and "/src/lib" becomes "tree/src/lib". The slashes
+ * inside the store path are kept as they are, so there is always a canonical
+ * entry point above the first real directory.
+ */
+function slopPath(storePath: string): string {
+  return storePath === "/" ? "tree" : `tree${storePath}`;
+}
+
+/**
+ * Builds the SLOP server for a file tree: an "overview" context node plus one node per
+ * directory present in `store` at call time (directories created later are not registered).
+ * In optimized mode, depth >= 2 directories carry no inline children and nothing below them is registered.
+ */
+export function createFileBrowserSlopServer(store: FileBrowserStore, opts?: FileBrowserSlopOpts) {
+  const slop = new SlopServer({
+    id: "file-browser",
+    name: "File Browser",
+    ...(opts?.maxNodes != null && { maxNodes: opts.maxNodes }),
+    ...(opts?.maxDepth != null && { maxDepth: opts.maxDepth }),
+  });
+  const optimized = opts?.optimized ?? false;
+  slop.register("overview", () => {
+    const allDirs = store.listDirs();
+    const total_dirs = allDirs.length;
+    const total_files = store.listFiles().length;
+    const empty_dirs = allDirs.filter((d) => d.path !== "/" && store.isDirEmpty(d.path)).length;
+    return {
+      type: "context",
+      props: { total_dirs, total_files, empty_dirs },
+      summary: `${total_dirs} directories (${empty_dirs} empty), ${total_files} files`,
+    };
+  });
+
+  // Inline children of a directory: stubs for subdirectories, full nodes for files.
+  const inlineChildren = (current: DirNode): Record<string, NodeDescriptor> => {
+    const children: Record<string, NodeDescriptor> = {};
+    for (const childDirPath of current.dirs) {
+      const child = store.getDir(childDirPath);
+      if (child) children[child.name] = buildDirStub(child, store);
+    }
+    for (const filePath of current.files) {
+      const file = store.getFile(filePath);
+      if (file) children[file.name] = buildFileNode(store, slop, file);
+    }
+    return children;
+  };
+
+  const describeDir = (path: string, lazy: boolean): NodeDescriptor => {
+    const current = store.getDir(path);
+    if (!current) return { type: "missing" };
+    const empty = store.isDirEmpty(current.path);
+    const actions: NonNullable<NodeDescriptor["actions"]> = {
+      create_file: {
+        label: "Create file",
+        description: "Create a new file inside this directory",
+        params: { name: { type: "string", description: "New file name" }, content: { type: "string", description: "File contents" } },
+        // The store has no createFile, so this only refreshes and reports the gap.
+        handler: async (_args) => {
+          slop.refresh();
+          return { error: "create_file not supported" };
+        },
+      },
+      create_subdir: {
+        label: "Create subdirectory",
+        description: "Create a new empty directory inside this one",
+        params: { name: { type: "string", description: "New directory name" } },
+        handler: async (args) => {
+          store.createDir(current.path, String(args.name));
+          slop.refresh();
+          return { id: current.path };
+        },
+      },
+    };
+    // State-dependent: delete is only offered for an empty, non-root directory.
+    if (empty && current.path !== "/") {
+      actions.delete = {
+        label: "Delete empty directory",
+        description: "Delete this directory (only available when empty)",
+        params: {},
+        handler: async () => {
+          store.deleteDir(current.path);
+          slop.refresh();
+          return { deleted: current.path };
+        },
+      };
+    }
+    const node: NodeDescriptor = {
+      type: "dir",
+      props: {
+        path: current.path,
+        child_dirs: current.dirs.length,
+        child_files: current.files.length,
+        is_empty: empty,
+      },
+      summary: lazy
+        ? `${current.dirs.length} subdirs, ${current.files.length} files (lazy — use slop_query to load)`
+        : undefined,
+      actions,
+    };
+    if (!lazy) node.children = inlineChildren(current);
+    return node;
+  };
+
+  // Pre-order registration; optimized mode stops descending at depth 2.
+  const registerDir = (dir: DirNode, depth: number): void => {
+    const lazy = optimized && depth >= 2;
+    slop.register(slopPath(dir.path), () => describeDir(dir.path, lazy));
+    if (lazy) return;
+    for (const childPath of dir.dirs) {
+      const child = store.getDir(childPath);
+      if (child) registerDir(child, depth + 1);
+    }
+  };
+  const root = store.getDir("/");
+  if (root) registerDir(root, 0);
+  return slop;
+}
+
+/** Compact child entry for a subdirectory: identity props and a count summary, no actions. */
+function buildDirStub(dir: DirNode, store: FileBrowserStore): NodeDescriptor {
+  const { path, name, dirs, files } = dir;
+  // Emptiness is asked of the store; the counts come from the node passed in.
+  const is_empty = store.isDirEmpty(path);
+  return {
+    type: "dir-stub",
+    props: { path, name, is_empty },
+    summary: `${dirs.length} subdirs, ${files.length} files`,
+  };
+}
+
+/**
+ * Full node for a file: metadata with a preview of the first 80 characters of
+ * content, plus read_file and delete_file actions. read_file looks the file
+ * up in the store on every call and yields "" as content when it is no
+ * longer there.
+ */
+function buildFileNode(store: FileBrowserStore, slop: SlopServer, file: FileNode): NodeDescriptor {
+  const readFile = async () => {
+    const latest = store.getFile(file.path);
+    return { path: file.path, content: latest?.content ?? "" };
+  };
+
+  const deleteFile = async () => {
+    store.deleteFile(file.path);
+    slop.refresh();
+    return { deleted: file.path };
+  };
+
+  return {
+    type: "file",
+    props: {
+      path: file.path,
+      name: file.name,
+      size_bytes: file.sizeBytes,
+      // Small preview only — the real content comes via the read_file affordance.
+      preview: file.content.slice(0, 80),
+    },
+    actions: {
+      read_file: { label: "Read file", description: "Return the full contents of this file", params: {}, handler: readFile },
+      delete_file: { label: "Delete file", description: "Delete this file", params: {}, handler: deleteFile },
+    },
+  };
+}
+
+/**
+ * Builds the file-browser SLOP server and serves it through Bun on `port`.
+ * Requests the SLOP handler leaves unanswered get a plain 200 banner.
+ */
+export function startFileBrowserSlopServer(store: FileBrowserStore, port: number, opts?: FileBrowserSlopOpts) {
+  const slop = createFileBrowserSlopServer(store, opts);
+  const handler = bunHandler(slop, { path: "/slop" });
+  const server = Bun.serve({
+    port,
+    websocket: handler.websocket,
+    fetch: (req, srv) => handler.fetch(req, srv) || new Response("SLOP File Browser benchmark server", { status: 200 }),
+  });
+  return { server, slop };
+}
diff --git a/benchmarks/v2/apps/file-browser/store.ts b/benchmarks/v2/apps/file-browser/store.ts
new file mode 100644
index 0000000..ad1cd52
--- /dev/null
+++ b/benchmarks/v2/apps/file-browser/store.ts
@@ -0,0 +1,122 @@
+export interface FileNode {
+  path: string; // full path; FileBrowserStore keys its `files` map by this value
+  name: string; // renameFile rewrites this together with `path`
+  sizeBytes: number;
+  content: string; // entire file content, held in memory
+}
+
+export interface DirNode {
+  path: string; // full path; FileBrowserStore keys its `dirs` map by this value
+  name: string;
+  dirs: string[];   // child dir paths (full paths, not bare names)
+  files: string[];  // child file paths (full paths, not bare names)
+}
+
+/**
+ * Deep tree store. Paths are unix-style absolute strings rooted at "/".
+ * All operations are path-indexed so the SLOP server can register nodes at
+ * arbitrary depths without tracking parent pointers separately.
+ *
+ * Mutators throw `Error` before changing anything when a precondition fails
+ * (missing source, missing target directory, name collision, invalid name).
+ */
+export class FileBrowserStore {
+  dirs = new Map<string, DirNode>();
+  files = new Map<string, FileNode>();
+
+  /** Replaces all state with copies of the given nodes (child-path arrays included). */
+  reset(dirs: DirNode[], files: FileNode[]) {
+    this.dirs.clear();
+    this.files.clear();
+    for (const d of dirs) this.dirs.set(d.path, { ...d, dirs: [...d.dirs], files: [...d.files] });
+    for (const f of files) this.files.set(f.path, { ...f });
+  }
+
+  // Plain lookups and snapshots; the returned nodes are the live stored objects.
+  getDir(path: string): DirNode | undefined { return this.dirs.get(path); }
+  getFile(path: string): FileNode | undefined { return this.files.get(path); }
+  listDirs(): DirNode[] { return Array.from(this.dirs.values()); }
+  listFiles(): FileNode[] { return Array.from(this.files.values()); }
+
+  /** True only for a known directory with no child dirs or files; unknown paths yield false. */
+  isDirEmpty(path: string): boolean {
+    const d = this.dirs.get(path);
+    return d !== undefined && d.dirs.length === 0 && d.files.length === 0;
+  }
+
+  deleteFile(path: string): void {
+    if (!this.files.delete(path)) throw new Error(`file ${path} not found`);
+    const parent = this.dirs.get(parentDir(path));
+    if (parent) parent.files = parent.files.filter((p) => p !== path);
+  }
+
+  deleteDir(path: string): void {
+    if (!this.dirs.has(path)) throw new Error(`dir ${path} not found`);
+    if (!this.isDirEmpty(path)) throw new Error(`dir ${path} is not empty`);
+    this.dirs.delete(path);
+    const parent = this.dirs.get(parentDir(path));
+    if (parent) parent.dirs = parent.dirs.filter((p) => p !== path);
+  }
+
+  /** Renames a file within its current directory and returns the updated node. */
+  renameFile(path: string, newName: string): FileNode {
+    const f = this.files.get(path);
+    if (!f) throw new Error(`file ${path} not found`);
+    FileBrowserStore.assertValidName(newName);
+    const parent = parentDir(path);
+    const newPath = joinPath(parent, newName);
+    if (this.files.has(newPath) || this.dirs.has(newPath)) throw new Error(`path ${newPath} already exists`);
+    this.files.delete(path);
+    const updated: FileNode = { ...f, path: newPath, name: newName };
+    this.files.set(newPath, updated);
+    const parentNode = this.dirs.get(parent);
+    if (parentNode) parentNode.files = parentNode.files.map((p) => (p === path ? newPath : p));
+    return updated;
+  }
+
+  /** Moves a file into another existing directory, keeping its name; returns the updated node. */
+  moveFile(path: string, newParentPath: string): FileNode {
+    const f = this.files.get(path);
+    if (!f) throw new Error(`file ${path} not found`);
+    const newParent = this.dirs.get(newParentPath);
+    if (!newParent) throw new Error(`dir ${newParentPath} not found`);
+    const oldParent = this.dirs.get(parentDir(path));
+    const newPath = joinPath(newParentPath, f.name);
+    if (this.files.has(newPath) || this.dirs.has(newPath)) throw new Error(`path ${newPath} already exists`);
+    this.files.delete(path);
+    const updated: FileNode = { ...f, path: newPath };
+    this.files.set(newPath, updated);
+    if (oldParent) oldParent.files = oldParent.files.filter((p) => p !== path);
+    newParent.files.push(newPath);
+    return updated;
+  }
+
+  createDir(parentPath: string, name: string): DirNode {
+    const parent = this.dirs.get(parentPath);
+    if (!parent) throw new Error(`dir ${parentPath} not found`);
+    FileBrowserStore.assertValidName(name);
+    const newPath = joinPath(parentPath, name);
+    if (this.dirs.has(newPath) || this.files.has(newPath)) throw new Error(`path ${newPath} already exists`);
+    const d: DirNode = { path: newPath, name, dirs: [], files: [] };
+    this.dirs.set(newPath, d);
+    parent.dirs.push(newPath);
+    return d;
+  }
+
+  /** Rejects empty names and names containing "/": such a path's parentDir() is not the directory that lists it. */
+  private static assertValidName(name: string): void {
+    if (name === "" || name.includes("/")) throw new Error(`invalid name ${JSON.stringify(name)}`);
+  }
+}
+
+/** Parent directory of an absolute unix-style path; "/" for the root, for top-level entries, and for slash-less input. */
+export function parentDir(path: string): string {
+  const idx = path.lastIndexOf("/");
+  // idx 0: the parent is the root. idx -1: no slash at all — slice(0, -1) would silently drop the last character.
+  return idx <= 0 ? "/" : path.slice(0, idx);
+}
+
+/** Appends `name` to `parent`, avoiding a doubled slash when the parent is the root. */
+export function joinPath(parent: string, name: string): string {
+  return parent === "/" ? `/${name}` : `${parent}/${name}`;
+}
diff --git a/benchmarks/v2/apps/issue-tracker.ts b/benchmarks/v2/apps/issue-tracker.ts
new file mode 100644
index 0000000..8978d3e
--- /dev/null
+++ b/benchmarks/v2/apps/issue-tracker.ts
@@ -0,0 +1,134 @@
+import { Client } from "@modelcontextprotocol/sdk/client";
+import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";
+import { IssueTrackerStore } from "../../mcp-vs-slop/app/store.ts";
+import { createSeedData, createLargeSeedData } from "../../mcp-vs-slop/app/seed.ts";
+import { startSlopServer, type SlopServerOpts } from "../../mcp-vs-slop/app/slop-server.ts";
+import type { AppBinding, AppStore, McpServerHandle, SlopServerHandle } from "./registry.ts";
+import type { DataScale } from "../runner/types.ts";
+import type { Scenario, VerificationResult } from "../../mcp-vs-slop/scenarios/types.ts";
+
+import { exploreAndAct } from "../../mcp-vs-slop/scenarios/explore-and-act.ts";
+import { triage } from "../../mcp-vs-slop/scenarios/triage.ts";
+import { bulkUpdate } from "../../mcp-vs-slop/scenarios/bulk-update.ts";
+import { scaleTriage } from "../../mcp-vs-slop/scenarios/scale-triage.ts";
+import { negative } from "../../mcp-vs-slop/scenarios/negative.ts";
+import { contextual } from "../../mcp-vs-slop/scenarios/contextual.ts";
+import { recovery } from "../../mcp-vs-slop/scenarios/recovery.ts";
+import { stateTransitions } from "../../mcp-vs-slop/scenarios/state-transitions.ts";
+import { crossEntity } from "../../mcp-vs-slop/scenarios/cross-entity.ts";
+import { conditional } from "../../mcp-vs-slop/scenarios/conditional.ts";
+import { ambiguity } from "../../mcp-vs-slop/scenarios/ambiguity.ts";
+import { complexWorkflow } from "../../mcp-vs-slop/scenarios/complex-workflow.ts";
+
+// v1 ships exactly two seed sizes, mapped here onto the v2 scale axis. Phase F
+// will grow the app's own generators so `m` / `xl` become supported.
+function seedForScale(scale: DataScale) {
+  if (scale === "s") {
+    return createSeedData();
+  }
+  if (scale === "l") {
+    return createLargeSeedData();
+  }
+  // Any other scale has no v1 dataset behind it.
+  throw new Error(`issue-tracker: scale "${scale}" not yet supported (supported: s, l)`);
+}
+
+function wrap(inner: IssueTrackerStore): AppStore & { inner: IssueTrackerStore } {
+  return { __brand: "app-store", inner }; // checked against the declared return type, so no cast is needed
+}
+
+/** Binding for the v1 issue-tracker app; store, seed data, servers and scenarios are reused from benchmarks/mcp-vs-slop. */
+export const issueTrackerApp: AppBinding = {
+  id: "issue-tracker",
+  supportedScales: ["s", "l"],
+  // `_seed` is unused: the v1 seed functions are called without a seed.
+  createStore(scale, _seed) {
+    const store = new IssueTrackerStore();
+    store.reset(seedForScale(scale));
+    return wrap(store);
+  },
+  async startSlopServer(store, port, opts: SlopServerOpts | undefined): Promise<SlopServerHandle> {
+    const inner = (store as unknown as { inner: IssueTrackerStore }).inner;
+    const { server: httpServer, slop } = startSlopServer(inner, port, opts);
+    return {
+      wsUrl: `ws://localhost:${port}/slop`,
+      stop: async () => {
+        slop.stop();
+        httpServer.stop();
+      },
+    };
+  },
+  scenarios: [
+    exploreAndAct, triage, bulkUpdate, scaleTriage, negative, contextual,
+    recovery, stateTransitions, crossEntity, conditional, ambiguity, complexWorkflow,
+  ],
+  verify(store, scenario) {
+    if (!scenario.verify) return undefined;
+    const inner = (store as unknown as { inner: IssueTrackerStore }).inner;
+    return scenario.verify(inner);
+  },
+  mcpSystemPrompt:
+    "You are an issue tracker agent. You have access to tools to interact with repositories, issues, and comments. " +
+    "You have NO prior knowledge of the data — use the tools to discover the current state. " +
+    'When done, respond with "DONE".',
+  /**
+   * Spawns the v1 stdio MCP server under `bun` and connects a client to it.
+   * Scale `l` is signalled via BENCH_LARGE_DATASET=1; scales other than s/l throw.
+   */
+  async startMcpServer(scale: DataScale, _variant: string): Promise<McpServerHandle> {
+    // All current MCP variants share the flat server; prompt-level variants
+    // are applied by the cell runner via resolveMcpVariant.
+    const env: Record<string, string> = { ...process.env } as Record<string, string>;
+    if (scale === "l") env.BENCH_LARGE_DATASET = "1";
+    else if (scale === "s") delete env.BENCH_LARGE_DATASET;
+    else throw new Error(`issue-tracker mcp: scale "${scale}" not supported`);
+
+    // URL#pathname is percent-encoded; decode it so a checkout path containing
+    // spaces or non-ASCII characters still resolves to the real file.
+    const serverUrl = new URL("../../mcp-vs-slop/app/mcp-server.ts", import.meta.url);
+    const serverPath = decodeURIComponent(serverUrl.pathname);
+    const transport = new StdioClientTransport({
+      command: "bun",
+      args: ["run", serverPath],
+      env,
+    });
+    const client = new Client({ name: "slop-benchmarks-v2", version: "0.2.0" });
+    await client.connect(transport);
+
+    return {
+      client,
+      stop: async () => {
+        await client.close();
+      },
+      // Rebuilds a throwaway store by walking repos, then issues, then comments over MCP.
+      verify: async (scenario: Scenario): Promise<VerificationResult | undefined> => {
+        if (!scenario.verify) return undefined;
+        const tempStore = new IssueTrackerStore();
+        const reposRes = await client.callTool({ name: "list_repos", arguments: {} });
+        const repos = parseToolJson(reposRes, []);
+        tempStore.repos = repos as IssueTrackerStore["repos"];
+        for (const repo of repos as Array<{ id: string }>) {
+          const issuesRes = await client.callTool({ name: "list_issues", arguments: { repo_id: repo.id } });
+          const issues = parseToolJson(issuesRes, []);
+          tempStore.issues.push(...(issues as IssueTrackerStore["issues"]));
+        }
+        for (const issue of tempStore.issues) {
+          const commentsRes = await client.callTool({ name: "list_comments", arguments: { issue_id: issue.id } });
+          const comments = parseToolJson(commentsRes, []);
+          tempStore.comments.push(...(comments as IssueTrackerStore["comments"]));
+        }
+        return scenario.verify(tempStore);
+      },
+    };
+  },
+};
+
+/**
+ * Parses the first text item of an MCP tool result as JSON. Yields `fallback`
+ * when there is no text item or its text is not valid JSON.
+ */
+function parseToolJson(result: unknown, fallback: unknown): unknown {
+  const items = (result as { content?: Array<{ type: string; text?: string }> }).content ?? [];
+  const raw = items.find((item) => item.type === "text")?.text ?? "";
+  try { return JSON.parse(raw); } catch { return fallback; }
+}
diff --git a/benchmarks/v2/apps/registry.ts b/benchmarks/v2/apps/registry.ts
new file mode 100644
index 0000000..831e62d
--- /dev/null
+++ b/benchmarks/v2/apps/registry.ts
@@ -0,0 +1,66 @@
+import type { Client } from "@modelcontextprotocol/sdk/client";
+import type { SlopServerOpts } from "../../mcp-vs-slop/app/slop-server.ts";
+import type { Scenario, VerificationResult } from "../../mcp-vs-slop/scenarios/types.ts";
+import type { AppId, DataScale } from "../runner/types.ts";
+import { crmApp } from "./crm/index.ts";
+import { fileBrowserApp } from "./file-browser/index.ts";
+import { issueTrackerApp } from "./issue-tracker.ts";
+import { todoApp } from "./todo/index.ts";
+
+/**
+ * Store + server + scenarios for a given benchmark app. Each app is a tuple
+ * of (storeFactory, serverLauncher, scenarios) with a declared set of
+ * supported data scales. The sweep runner skips cells whose (app, scale)
+ * combination isn't supported.
+ */
+export interface AppBinding {
+  id: AppId; // matches the key this binding is registered under
+  supportedScales: DataScale[];
+  /** Build a fresh store seeded for the requested scale. An app may ignore `seed` (issue-tracker does). */
+  createStore(scale: DataScale, seed: number): AppStore;
+  /** Boot a SLOP server exposing the given store. Returns stop() + URL. */
+  startSlopServer(store: AppStore, port: number, opts: SlopServerOpts | undefined): Promise<SlopServerHandle>;
+  /** Scenarios available for this app. */
+  scenarios: Scenario[];
+  /** Run the scenario's verifier against this app's store. Returns undefined if the scenario has no verifier. */
+  verify(store: AppStore, scenario: Scenario): VerificationResult | undefined;
+  /**
+   * Launch an MCP server for this app at the requested scale and return a
+   * handle. `variant` selects among fair-MCP variants (flat / flat+prompt /
+   * resources / prompts) — apps that only support `flat` may throw for the others.
+   */
+  startMcpServer?(scale: DataScale, variant: string): Promise<McpServerHandle>;
+  /** System prompt for MCP runs. Domain-specific and tuned per app. */
+  mcpSystemPrompt?: string;
+}
+
+export interface McpServerHandle {
+  client: Client; // the issue-tracker and todo bindings connect it before returning the handle
+  stop(): Promise<void>; // those two bindings implement this by closing the client
+  /** Rebuild enough state from MCP tool calls to run the scenario's verifier. */
+  verify(scenario: Scenario): Promise<VerificationResult | undefined>;
+}
+
+export interface AppStore {
+  /** Unknown-by-design — each app is responsible for its own store type. */
+  readonly __brand: "app-store";
+  readonly inner: unknown; // the app's concrete store; each binding casts it back itself
+}
+
+export interface SlopServerHandle {
+  wsUrl: string; // WebSocket URL of the SLOP endpoint, e.g. ws://localhost:<port>/slop
+  stop(): Promise<void>;
+}
+
+const registry: Record<AppId, AppBinding | undefined> = { // one entry per AppId; read only through resolveApp
+  "issue-tracker": issueTrackerApp,
+  todo: todoApp,
+  "file-browser": fileBrowserApp,
+  crm: crmApp,
+};
+
+export function resolveApp(id: AppId): AppBinding {
+  const found = registry[id]; // undefined when no binding is registered for this id
+  if (found) return found;
+  throw new Error(`App not yet implemented in v2: ${id}`);
+}
diff --git a/benchmarks/v2/apps/todo/index.ts b/benchmarks/v2/apps/todo/index.ts
new file mode 100644
index 0000000..6988c4e
--- /dev/null
+++ b/benchmarks/v2/apps/todo/index.ts
@@ -0,0 +1,79 @@
+import { Client } from "@modelcontextprotocol/sdk/client";
+import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";
+import { TodoStore } from "./store.ts";
+import { seedTodo } from "./seed.ts";
+import { startTodoSlopServer, type TodoSlopOpts } from "./slop-server.ts";
+import { todoScenarios } from "./scenarios.ts";
+import type { AppBinding, AppStore, McpServerHandle, SlopServerHandle } from "../registry.ts";
+import type { DataScale } from "../../runner/types.ts";
+import type { Scenario, VerificationResult } from "../../../mcp-vs-slop/scenarios/types.ts";
+
+function wrap(inner: TodoStore): AppStore & { inner: TodoStore } {
+  return { __brand: "app-store", inner }; // checked against the declared return type, so no cast is needed
+}
+
+/** Binding for the todo app: a seeded in-process store for SLOP runs, a stdio child process for MCP runs. */
+export const todoApp: AppBinding = {
+  id: "todo",
+  supportedScales: ["s", "m", "l", "xl"],
+  createStore(scale, seed) {
+    const store = new TodoStore();
+    store.reset(seedTodo(scale, seed));
+    return wrap(store);
+  },
+  async startSlopServer(store, port, opts): Promise<SlopServerHandle> {
+    const inner = (store as unknown as { inner: TodoStore }).inner;
+    const { server, slop } = startTodoSlopServer(inner, port, opts as TodoSlopOpts | undefined);
+    return {
+      wsUrl: `ws://localhost:${port}/slop`,
+      stop: async () => {
+        slop.stop();
+        server.stop();
+      },
+    };
+  },
+  scenarios: todoScenarios,
+  verify(store, scenario) {
+    if (!scenario.verify) return undefined;
+    const inner = (store as unknown as { inner: TodoStore }).inner;
+    // Scenario.verify is typed against the v1 store, so the TodoStore goes through a cast.
+    return scenario.verify(inner as unknown as Parameters<NonNullable<Scenario["verify"]>>[0]);
+  },
+  mcpSystemPrompt:
+    "You are a todo-list agent. You have tools to list tasks and mutate them. " +
+    "You have no prior knowledge of the data — discover it by listing tasks. " +
+    'When the task is complete, respond with "DONE".',
+  async startMcpServer(scale: DataScale, _variant: string): Promise<McpServerHandle> {
+    // Every variant we currently ship (flat, flat+prompt) uses the same
+    // underlying stdio MCP server — only the system prompt differs, and
+    // that's handled by the cell runner via resolveMcpVariant. If a future
+    // variant needs a different server (resources, prompts), dispatch here.
+    const env: Record<string, string> = { ...process.env } as Record<string, string>;
+    env.BENCH_SCALE = scale;
+    env.BENCH_SEED = String(42); // fixed value: this method receives no seed argument
+    // URL#pathname is percent-encoded; decode it so a checkout path containing
+    // spaces or non-ASCII characters still resolves to the real file.
+    const serverPath = decodeURIComponent(new URL("./mcp-server.ts", import.meta.url).pathname);
+    const transport = new StdioClientTransport({ command: "bun", args: ["run", serverPath], env });
+    const client = new Client({ name: "slop-benchmarks-v2", version: "0.2.0" });
+    await client.connect(transport);
+    return {
+      client,
+      stop: async () => {
+        await client.close();
+      },
+      verify: async (scenario: Scenario): Promise<VerificationResult | undefined> => {
+        if (!scenario.verify) return undefined;
+        // Reconstruct a TodoStore from one list_tasks call. No children to
+        // recurse, so this is a single round trip.
+        const res = await client.callTool({ name: "list_tasks", arguments: {} });
+        const content = (res as { content?: Array<{ type: string; text?: string }> }).content ?? [];
+        const text = content.find((c) => c.type === "text")?.text ?? "[]";
+        const tasks = JSON.parse(text);
+        const tempStore = new TodoStore();
+        tempStore.reset(tasks);
+        return scenario.verify(tempStore as unknown as Parameters<NonNullable<Scenario["verify"]>>[0]);
+      },
+    };
+  },
+};
diff --git a/benchmarks/v2/apps/todo/mcp-server.ts b/benchmarks/v2/apps/todo/mcp-server.ts
new file mode 100644
index 0000000..a8bb3a0
--- /dev/null
+++ b/benchmarks/v2/apps/todo/mcp-server.ts
@@ -0,0 +1,174 @@
+/**
+ * Stdio MCP server for the todo benchmark app. Spawned as a child process by
+ * the MCP cell runner with env vars:
+ * - BENCH_SCALE = s | m | l | xl    (optional; defaults to s)
+ * - BENCH_SEED  = integer           (optional; defaults to 42)
+ *
+ * Both are read at startup; the agent sees whatever the seed produced.
+ */
+
+import { Server } from "@modelcontextprotocol/sdk/server";
+import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
+import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";
+import { TodoStore, type Priority } from "./store.ts";
+import { seedTodo } from "./seed.ts";
+import type { DataScale } from "../../runner/types.ts";
+
+const scale = (process.env.BENCH_SCALE as DataScale | undefined) ?? "s";
+// seedTodo looks the scale up in a size table; an unknown value would yield no tasks at all, so fail loudly instead.
+if (!["s", "m", "l", "xl"].includes(scale)) throw new Error(`todo-mcp: unsupported BENCH_SCALE "${scale}"`);
+const seed = Number(process.env.BENCH_SEED ?? 42);
+const store = new TodoStore();
+store.reset(seedTodo(scale, seed));
+const server = new Server({ name: "todo-mcp", version: "0.2.0" }, { capabilities: { tools: {} } });
+
+server.setRequestHandler(ListToolsRequestSchema, async () => ({ // static tool catalogue; each name has a matching case in the CallTool handler
+  tools: [
+    {
+      name: "list_tasks",
+      description: "List every task in the todo app",
+      inputSchema: { type: "object" as const, properties: {} },
+    },
+    {
+      name: "get_task",
+      description: "Get a single task by id",
+      inputSchema: {
+        type: "object" as const,
+        properties: { id: { type: "string", description: "Task id" } },
+        required: ["id"],
+      },
+    },
+    {
+      name: "mark_done",
+      description: "Mark a task as done (no-op if already done)",
+      inputSchema: {
+        type: "object" as const,
+        properties: { id: { type: "string", description: "Task id" } },
+        required: ["id"],
+      },
+    },
+    {
+      name: "reopen_task",
+      description: "Mark a done task as not done (no-op if already undone)",
+      inputSchema: {
+        type: "object" as const,
+        properties: { id: { type: "string", description: "Task id" } },
+        required: ["id"],
+      },
+    },
+    {
+      name: "set_priority",
+      description: "Set a task's priority (low, medium, high)",
+      inputSchema: {
+        type: "object" as const,
+        properties: {
+          id: { type: "string", description: "Task id" },
+          priority: { type: "string", description: "low | medium | high" }, // plain string in the schema; the CallTool handler rejects other values
+        },
+        required: ["id", "priority"],
+      },
+    },
+    {
+      name: "set_tag",
+      description: "Set a task's tag. Empty string clears it.",
+      inputSchema: {
+        type: "object" as const,
+        properties: {
+          id: { type: "string", description: "Task id" },
+          tag: { type: "string", description: "Tag name; empty string to clear" }, // the CallTool handler stores "" as null
+        },
+        required: ["id", "tag"],
+      },
+    },
+    {
+      name: "edit_title",
+      description: "Rename a task",
+      inputSchema: {
+        type: "object" as const,
+        properties: {
+          id: { type: "string", description: "Task id" },
+          title: { type: "string", description: "New title" },
+        },
+        required: ["id", "title"],
+      },
+    },
+    {
+      name: "delete_task",
+      description: "Delete a task permanently",
+      inputSchema: {
+        type: "object" as const,
+        properties: { id: { type: "string", description: "Task id" } },
+        required: ["id"],
+      },
+    },
+  ],
+}));
+
+/**
+ * Dispatches a tool call against the in-memory store. `list_tasks` returns the
+ * whole task array; every other known tool first resolves its target task by
+ * `id`. All failures (unknown tool, unknown task id, invalid priority, or an
+ * exception thrown along the way) are returned as `isError` results.
+ */
+server.setRequestHandler(CallToolRequestSchema, async (req) => {
+  const toolName = req.params.name;
+  const input = (req.params.arguments ?? {}) as Record<string, unknown>;
+  try {
+    if (toolName === "list_tasks") return json(store.tasks);
+
+    // Every remaining tool acts on one task. Check the name before the lookup
+    // so the "unknown tool" error does not depend on the arguments.
+    const taskTools = ["get_task", "mark_done", "reopen_task", "set_priority", "set_tag", "edit_title", "delete_task"];
+    if (!taskTools.includes(toolName)) return err(`unknown tool ${toolName}`);
+
+    const task = store.get(String(input.id));
+    if (!task) return err(`task ${input.id} not found`);
+
+    switch (toolName) {
+      case "get_task":
+        return json(task);
+      case "mark_done":
+        store.setDone(task.id, true);
+        return json({ id: task.id, done: true });
+      case "reopen_task":
+        store.setDone(task.id, false);
+        return json({ id: task.id, done: false });
+      case "set_priority": {
+        // The schema types priority as a plain string, so the value is validated here.
+        const priority = String(input.priority);
+        if (!["low", "medium", "high"].includes(priority)) return err(`invalid priority ${priority}`);
+        store.setPriority(task.id, priority as Priority);
+        return json({ id: task.id, priority });
+      }
+      case "set_tag": {
+        // A missing or empty tag clears it (stored as null).
+        const raw = String(input.tag ?? "");
+        const tag = raw === "" ? null : raw;
+        store.setTag(task.id, tag);
+        return json({ id: task.id, tag });
+      }
+      case "edit_title":
+        store.editTitle(task.id, String(input.title));
+        return json({ id: task.id });
+      case "delete_task":
+        store.delete(task.id);
+        return json({ deleted: task.id });
+      default:
+        // Not reachable: the name was checked against taskTools above.
+        return err(`unknown tool ${toolName}`);
+    }
+  } catch (e) {
+    return err(e instanceof Error ? e.message : String(e));
+  }
+});
+
+// Tool results hold a single JSON-encoded text item; err() wraps the message as { error } and adds isError.
+function json(data: unknown) {
+  return { content: [{ type: "text", text: JSON.stringify(data) }] };
+}
+function err(msg: string) {
+  return { ...json({ error: msg }), isError: true };
+}
+
+// Attach the server to a stdio transport; the spawning process talks to it over stdin/stdout.
+await server.connect(new StdioServerTransport());
diff --git a/benchmarks/v2/apps/todo/scenarios.ts b/benchmarks/v2/apps/todo/scenarios.ts
new file mode 100644
index 0000000..614621a
--- /dev/null
+++ b/benchmarks/v2/apps/todo/scenarios.ts
@@ -0,0 +1,91 @@
+import type { Scenario, VerificationResult } from "../../../mcp-vs-slop/scenarios/types.ts";
+import type { TodoStore } from "./store.ts";
+
+/**
+ * Todo scenarios are deliberately simple — they test the floor of the
+ * complexity ladder. If SLOP's advantages shrink here we want to see it.
+ * Each verifier is scale-independent: it checks predicates across whatever
+ * tasks the store was seeded with, not a fixed count.
+ */
+
+// Scenarios conform to v1's Scenario type to reuse the AppBinding surface —
+// the `steps` field is only exercised in scripted mode (not used in v2 yet)
+// so we provide empty arrays.
+
+const empty: Scenario["steps"] = []; // one shared array instance, used as `steps` by every scenario in this file
+
+/**
+ * Passes when the store holds no undone task. Deletions are not detected: a store with no tasks left also passes.
+ */
+function verifyAllDone(store: TodoStore): VerificationResult {
+  const remaining = store.tasks.filter((task) => !task.done).length;
+  const passed = remaining === 0;
+  const check = {
+    name: "all tasks are done",
+    passed,
+    detail: passed ? undefined : `${remaining} tasks still undone`,
+  };
+  return { passed, checks: [check] };
+}
+
+/**
+ * Passes when no remaining task is done. It does not check that the undone tasks were left as they were.
+ */
+function verifyOnlyUndoneRemain(store: TodoStore): VerificationResult {
+  const leftover = store.tasks.filter((task) => task.done).length;
+  const passed = leftover === 0;
+  const check = {
+    name: "no done tasks remain",
+    passed,
+    detail: passed ? undefined : `${leftover} done tasks were not deleted`,
+  };
+  return { passed, checks: [check] };
+}
+
+/**
+ * Two checks must both hold: every task tagged "bug" is high priority, and
+ * no other task is. The second check relies on the seed capping non-bug
+ * tasks at medium, so a high-priority non-bug means the agent touched a
+ * task it should not have.
+ */
+function verifyBugsHighPriority(store: TodoStore): VerificationResult {
+  const { tasks } = store;
+  const lowBugs = tasks.filter((t) => t.tag === "bug" && t.priority !== "high").length;
+  const promotedOthers = tasks.filter((t) => t.tag !== "bug" && t.priority === "high").length;
+  const bugsOk = lowBugs === 0;
+  const othersOk = promotedOthers === 0;
+
+  // A check's detail is only filled in when that check fails.
+  return {
+    passed: bugsOk && othersOk,
+    checks: [
+      { name: "every bug tagged task is priority=high", passed: bugsOk, detail: bugsOk ? undefined : `${lowBugs} bug tasks not high priority` },
+      { name: "no non-bug tasks elevated to high", passed: othersOk, detail: othersOk ? undefined : `${promotedOthers} non-bug tasks incorrectly promoted` },
+    ],
+  };
+}
+
+export const todoScenarios: Scenario[] = [ // each verify casts its store argument to TodoStore before checking
+  {
+    name: "mark-all-done",
+    description: "Mark every task as done, touching each task exactly once.",
+    agentPrompt: "Mark every task as done. Don't skip any. Don't delete or modify anything else.",
+    steps: empty,
+    verify: (store) => verifyAllDone(store as unknown as TodoStore), // checks final done flags only
+  },
+  {
+    name: "delete-completed",
+    description: "Delete every already-done task, leaving only undone tasks.",
+    agentPrompt:
+      "Delete every task that is already marked done. Leave the undone tasks untouched.",
+    steps: empty,
+    verify: (store) => verifyOnlyUndoneRemain(store as unknown as TodoStore), // checks that no done task is left
+  },
+  {
+    name: "prioritize-bugs",
+    description: "Set priority=high on every task tagged as 'bug'.",
+    agentPrompt: "For every task tagged as 'bug', set its priority to high. Don't touch any other tasks.",
+    steps: empty,
+    verify: (store) => verifyBugsHighPriority(store as unknown as TodoStore),
+  },
+];
diff --git a/benchmarks/v2/apps/todo/seed.ts b/benchmarks/v2/apps/todo/seed.ts
new file mode 100644
index 0000000..548ae1a
--- /dev/null
+++ b/benchmarks/v2/apps/todo/seed.ts
@@ -0,0 +1,63 @@
+import type { DataScale } from "../../runner/types.ts";
+import type { Priority, Task } from "./store.ts";
+
+const PRIORITIES: Priority[] = ["low", "medium", "high"]; // bug tasks draw their starting priority from all three
+const TAGS = ["bug", "meeting", "errand", "read", "chore", "work", "personal"]; // "bug" is assigned via BUG_SHARE, never drawn from this list
+const TITLES = [ // cycled by task index; seedTodo appends " #<n>" to each
+  "Fix login redirect loop",
+  "Review sprint metrics",
+  "Pick up groceries",
+  "Read new pricing RFC",
+  "Update quarterly OKRs",
+  "Call dentist",
+  "Refactor data loader",
+  "Write postmortem",
+  "Prep 1:1 agenda",
+  "Cancel old subscription",
+];
+
+const SIZES: Record<DataScale, number> = { s: 8, m: 30, l: 100, xl: 500 }; // number of tasks generated per scale
+const BUG_SHARE = 0.2; // ~20% of tasks tagged as bug
+const DONE_SHARE = 0.25; // probability that a task is seeded as already done
+
+/**
+ * Deterministic seeded PRNG — xorshift32, yielding values in [0, 1). Two runs with the same
+ * (scale, seed) produce byte-identical tasks. This is what lets the sweep reproduce cells.
+ */
+function makeRng(seed: number) {
+  let x = (seed | 0) || 0x1234567; // int32-truncate first: a state of 0 (e.g. from seed 0.5) would emit zeros forever
+  return () => {
+    x ^= x << 13;
+    x ^= x >>> 17;
+    x ^= x << 5;
+    return ((x >>> 0) % 1_000_000) / 1_000_000;
+  };
+}
+
+/**
+ * Generates the task list for `scale` from `seed`. Ids, titles and createdAt come from the
+ * task index alone; tag, priority and done state come from the RNG, drawn in a fixed order.
+ */
+export function seedTodo(scale: DataScale, seed: number): Task[] {
+  const rng = makeRng(seed);
+  const pick = <T>(pool: T[]): T => pool[Math.floor(rng() * pool.length)];
+  const nonBugTags = TAGS.filter((t) => t !== "bug");
+  const tasks: Task[] = [];
+  for (let i = 0; i < SIZES[scale]; i++) {
+    const isBug = rng() < BUG_SHARE;
+    // Non-bug tasks are capped at medium so `prioritize-bugs` can check "no non-bug is high"
+    // without a pre-state snapshot. Bugs start at anything; the agent must raise them to high.
+    const tag = isBug ? "bug" : rng() < 0.7 ? pick(nonBugTags) : null;
+    const priority = pick(isBug ? PRIORITIES : (["low", "medium"] as Priority[]));
+    const done = rng() < DONE_SHARE;
+    tasks.push({
+      id: `task-${i + 1}`,
+      title: `${TITLES[i % TITLES.length]} #${i + 1}`,
+      priority,
+      tag,
+      done,
+      createdAt: 1_700_000_000_000 + i * 1000,
+    });
+  }
+  return tasks;
+}
diff --git a/benchmarks/v2/apps/todo/slop-server.ts b/benchmarks/v2/apps/todo/slop-server.ts
new file mode 100644
index 0000000..1a87b0a
--- /dev/null
+++ b/benchmarks/v2/apps/todo/slop-server.ts
@@ -0,0 +1,184 @@
+import { SlopServer } from "@slop-ai/server";
+import { bunHandler } from "@slop-ai/server/bun";
+import type { NodeDescriptor } from "@slop-ai/core";
+import type { TodoStore, Priority, Task } from "./store.ts";
+
+export interface TodoSlopOpts {
+  maxNodes?: number; // forwarded to the SlopServer constructor only when set
+  maxDepth?: number; // forwarded to the SlopServer constructor only when set
+  /**
+   * optimized=true: the tasks collection gets a summary, every task node carries
+   * meta.salience (undone > done; undone bugs and high priority rank higher), and children are
+   * emitted in descending salience. optimized=false (the default) emits every task in store order.
+   */
+  optimized?: boolean;
+}
+
+/**
+ * Build the todo app's SlopServer with two registered nodes: "overview" (aggregate
+ * counts) and "tasks" (a collection with one child node per task, keyed by task id).
+ * `opts.maxNodes` / `opts.maxDepth` are passed to SlopServer only when set.
+ */
+export function createTodoSlopServer(store: TodoStore, opts?: TodoSlopOpts) {
+  const slop = new SlopServer({
+    id: "todo",
+    name: "Todo",
+    ...(opts?.maxNodes != null && { maxNodes: opts.maxNodes }),
+    ...(opts?.maxDepth != null && { maxDepth: opts.maxDepth }),
+  });
+  const optimized = opts?.optimized ?? false;
+  slop.register("overview", () => {
+    const total = store.tasks.length;
+    const done = store.tasks.filter((t) => t.done).length;
+    const bugs = store.tasks.filter((t) => t.tag === "bug").length;
+    const undone = total - done;
+    return {
+      type: "context",
+      props: { total, done, undone, bugs },
+      summary: `${total} tasks (${undone} undone, ${done} done, ${bugs} tagged bug)`,
+    };
+  });
+
+  slop.register("tasks", () => {
+    const all = store.tasks;
+    if (!optimized) {
+      // Plain mode: every task in store order, without summary or salience.
+      return {
+        type: "collection",
+        props: { count: all.length },
+        children: Object.fromEntries(all.map((task) => [task.id, buildTaskNode(store, slop, task)])),
+      } satisfies NodeDescriptor;
+    }
+    // Optimized mode: score each task, then emit children from most to least salient.
+    const done = all.filter((t) => t.done).length;
+    const ranked = all
+      .map((task) => ({ task, salience: salienceFor(task) }))
+      .sort((a, b) => b.salience - a.salience);
+    return {
+      type: "collection",
+      props: { count: all.length },
+      summary: `${all.length} tasks: ${all.length - done} undone, ${done} done.`,
+      children: Object.fromEntries(
+        ranked.map(({ task, salience }) => [task.id, buildTaskNode(store, slop, task, salience)]),
+      ),
+    } satisfies NodeDescriptor;
+  });
+
+  return slop;
+}
+
+/** Salience in [0.1, 1]: base 0.1 (done) or 0.5 (undone), +0.3 for an undone bug, +0.2 for high priority. */
+function salienceFor(t: Task): number {
+  const bugBonus = !t.done && t.tag === "bug" ? 0.3 : 0;
+  const score = (t.done ? 0.1 : 0.5) + bugBonus + (t.priority === "high" ? 0.2 : 0);
+  return Math.min(1, score);
+}
+
+function buildTaskNode( // Build the node descriptor (props + action handlers) for a single task.
+  store: TodoStore,
+  slop: SlopServer,
+  task: Task,
+  salience?: number, // optional; when given it is attached as node.meta.salience
+): NodeDescriptor {
+  const actions: NonNullable<NodeDescriptor["actions"]> = { // actions offered for every task, done or not
+    edit_title: {
+      label: "Edit title",
+      description: "Rename this task",
+      params: { title: { type: "string", description: "New title" } },
+      handler: async (params) => {
+        store.editTitle(task.id, params.title as string); // throws (via store.mustGet) if the task no longer exists
+        slop.refresh(); // NOTE(review): presumably makes the server re-evaluate its registered nodes — confirm in @slop-ai/server
+        return { id: task.id };
+      },
+    },
+    set_priority: {
+      label: "Set priority",
+      description: "Set task priority (low, medium, high)",
+      params: {
+        priority: { type: "string", description: "low | medium | high" },
+      },
+      handler: async (params) => {
+        store.setPriority(task.id, params.priority as Priority); // NOTE(review): unchecked cast — whatever value arrives is stored as-is
+        slop.refresh();
+        return { id: task.id };
+      },
+    },
+    set_tag: {
+      label: "Set tag",
+      description: "Assign a tag to this task (empty string clears it)",
+      params: { tag: { type: "string", description: "Tag name, or empty string to clear" } },
+      handler: async (params) => {
+        const t = String(params.tag ?? ""); // a missing/nullish tag is treated like ""
+        store.setTag(task.id, t === "" ? null : t); // "" clears the tag (stored as null)
+        slop.refresh();
+        return { id: task.id };
+      },
+    },
+    delete: {
+      label: "Delete task",
+      description: "Delete this task permanently",
+      params: {},
+      handler: async () => {
+        store.delete(task.id); // store.delete filters by id, so an already-removed task is a no-op
+        slop.refresh();
+        return { deleted: task.id };
+      },
+    },
+  };
+
+  // State-dependent affordance: only expose `mark_done` when not done, and
+  // `reopen` when done — this is a key SLOP pitch so we exercise it.
+  if (task.done) {
+    actions.reopen = {
+      label: "Reopen",
+      description: "Mark this task as not done",
+      params: {},
+      handler: async () => {
+        store.setDone(task.id, false);
+        slop.refresh();
+        return { id: task.id };
+      },
+    };
+  } else {
+    actions.mark_done = {
+      label: "Mark done",
+      description: "Mark this task as done",
+      params: {},
+      handler: async () => {
+        store.setDone(task.id, true);
+        slop.refresh();
+        return { id: task.id };
+      },
+    };
+  }
+
+  const node: NodeDescriptor = {
+    type: "task",
+    props: {
+      title: task.title,
+      done: task.done,
+      priority: task.priority,
+      tag: task.tag ?? "", // a null tag is exposed as the empty string
+    },
+    actions,
+  };
+  if (salience !== undefined) { // meta is only present when a salience was supplied
+    node.meta = { salience };
+  }
+  return node;
+}
+
+/**
+ * Serve the todo SLOP server on `port` via Bun.serve, with the SLOP handler mounted at "/slop"; requests it leaves unanswered get a 200 banner.
+ */
+export function startTodoSlopServer(store: TodoStore, port: number, opts?: TodoSlopOpts) {
+  const slop = createTodoSlopServer(store, opts);
+  const handler = bunHandler(slop, { path: "/slop" });
+  const banner = () => new Response("SLOP Todo benchmark server", { status: 200 });
+  const server = Bun.serve({
+    port,
+    websocket: handler.websocket,
+    fetch: (req, srv) => handler.fetch(req, srv) || banner(),
+  });
+  return { server, slop };
+}
diff --git a/benchmarks/v2/apps/todo/store.ts b/benchmarks/v2/apps/todo/store.ts
new file mode 100644
index 0000000..93af342
--- /dev/null
+++ b/benchmarks/v2/apps/todo/store.ts
@@ -0,0 +1,69 @@
+export type Priority = "low" | "medium" | "high"; // the three allowed task priority levels
+
+export interface Task { // one todo item as held by TodoStore
+  id: string; // lookup key used by TodoStore.get / mustGet
+  title: string;
+  done: boolean;
+  priority: Priority;
+  tag: string | null; // null means the task has no tag
+  createdAt: number; // numeric timestamp; TodoStore.add() fills it from Date.now()
+}
+
+/**
+ * In-memory task list for the todo benchmark app. Id-based mutators throw `Task <id> not found` for unknown ids; `delete` ignores them.
+ */
+export class TodoStore {
+  tasks: Task[] = [];
+
+  /** Replace the contents with shallow copies of `tasks`, so later edits never touch the caller's objects. */
+  reset(tasks: Task[]) {
+    this.tasks = tasks.map((t) => ({ ...t }));
+  }
+
+  get(id: string): Task | undefined {
+    return this.tasks.find((t) => t.id === id);
+  }
+
+  /** Like `get`, but throws when no task has this id. */
+  mustGet(id: string): Task {
+    const t = this.get(id);
+    if (!t) throw new Error(`Task ${id} not found`);
+    return t;
+  }
+
+  /**
+   * Append a task stamped with the current time. Without an id, the first unused `task-<n>` from
+   * n = length + 1 is taken, because after a delete `task-${length + 1}` may still be in use.
+   */
+  add(task: Omit<Task, "id" | "createdAt"> & { id?: string }): Task {
+    const { id: requestedId, ...fields } = task; // keeps an explicit `id: undefined` from overwriting the generated id
+    let n = this.tasks.length + 1;
+    while (requestedId == null && this.get(`task-${n}`)) n++;
+    const t: Task = { id: requestedId ?? `task-${n}`, createdAt: Date.now(), ...fields };
+    this.tasks.push(t);
+    return t;
+  }
+
+  toggleDone(id: string): Task {
+    const t = this.mustGet(id);
+    return Object.assign(t, { done: !t.done });
+  }
+
+  setDone(id: string, done: boolean): Task {
+    return Object.assign(this.mustGet(id), { done });
+  }
+  setPriority(id: string, priority: Priority): Task {
+    return Object.assign(this.mustGet(id), { priority });
+  }
+  setTag(id: string, tag: string | null): Task {
+    return Object.assign(this.mustGet(id), { tag });
+  }
+  editTitle(id: string, title: string): Task {
+    return Object.assign(this.mustGet(id), { title });
+  }
+
+  /** Remove the task with this id; unknown ids are ignored. */
+  delete(id: string): void {
+    this.tasks = this.tasks.filter((t) => t.id !== id);
+  }
+}
diff --git a/benchmarks/v2/config/ablation.ts b/benchmarks/v2/config/ablation.ts
new file mode 100644
index 0000000..fe457b3
--- /dev/null
+++ b/benchmarks/v2/config/ablation.ts
@@ -0,0 +1,31 @@
+import type { SweepConfig } from "../runner/types.ts";
+
+const DGX_URL = process.env.SLOP_DGX_URL ?? "http://slopinator-s-1.local:11434/v1"; // SLOP_DGX_URL overrides the default DGX endpoint
+
+/**
+ * First real ablation: prompts × encodings × protocols on the todo app's
+ * fastest scenario at small scale. Goal is to light up every registry entry
+ * at least once and let the dashboard pivot across dimensions.
+ *
+ * Cell math: 3 prompts × 3 encodings × 1 optimization = 9 SLOP cells,
+ * plus 2 MCP variants = 11 cells × 1 iteration.
+ */
+export const ablationSweep: SweepConfig = {
+  id: "ablation-prompts-encodings",
+  providers: [
+    { kind: "openai-compat", baseUrl: DGX_URL, model: "gemma4:31b" },
+  ],
+  promptVariants: ["minimal", "spec", "spec-terse"], // 3 prompts …
+  encodingVariants: ["indented-text", "json-compact", "markdown-headings"], // … × 3 encodings …
+  optimizationVariants: ["off"], // … × 1 optimization = 9 SLOP cells
+  protocols: ["slop", "mcp"],
+  mcpVariants: ["flat", "flat+prompt"], // + 2 MCP cells = 11 cells in total
+  apps: ["todo"],
+  dataScales: ["s"],
+  scenarioFilter: ["mark-all-done"],
+  seeds: [42],
+  iterations: 1, // one run per cell
+  maxConcurrency: 1,
+  maxTurns: 30,
+  temperature: 0,
+};
diff --git a/benchmarks/v2/config/smoke-crm.ts b/benchmarks/v2/config/smoke-crm.ts
new file mode 100644
index 0000000..c638fc4
--- /dev/null
+++ b/benchmarks/v2/config/smoke-crm.ts
@@ -0,0 +1,28 @@
+import type { SweepConfig } from "../runner/types.ts";
+
+const DGX_URL = process.env.SLOP_DGX_URL ?? "http://slopinator-s-1.local:11434/v1"; // SLOP_DGX_URL overrides the default DGX endpoint
+
+/**
+ * Validation sweep for the crm app. Runs one easy scenario (high-value-alert)
+ * on both SLOP and MCP at scale=s so we can see the top-of-ladder end-to-end
+ * without blowing out token budgets.
+ */
+export const smokeCrmSweep: SweepConfig = {
+  id: "smoke-crm",
+  providers: [
+    { kind: "openai-compat", baseUrl: DGX_URL, model: "gemma4:31b" },
+  ],
+  promptVariants: ["spec"],
+  encodingVariants: ["indented-text"],
+  optimizationVariants: ["off"],
+  protocols: ["slop", "mcp"], // one SLOP cell and one MCP cell
+  mcpVariants: ["flat"],
+  apps: ["crm"],
+  dataScales: ["s"],
+  scenarioFilter: ["high-value-alert"],
+  seeds: [42],
+  iterations: 1, // one run per cell
+  maxConcurrency: 1,
+  maxTurns: 40, // higher than the 20–30 used by the other smoke sweeps
+  temperature: 0,
+};
diff --git a/benchmarks/v2/config/smoke-file-browser.ts b/benchmarks/v2/config/smoke-file-browser.ts
new file mode 100644
index 0000000..a96e143
--- /dev/null
+++ b/benchmarks/v2/config/smoke-file-browser.ts
@@ -0,0 +1,28 @@
+import type { SweepConfig } from "../runner/types.ts";
+
+const DGX_URL = process.env.SLOP_DGX_URL ?? "http://slopinator-s-1.local:11434/v1"; // SLOP_DGX_URL overrides the default DGX endpoint
+
+/**
+ * Validation sweep for the file-browser app. delete-empty-dirs exercises the
+ * state-dependent affordance (delete only available on empty dirs) on SLOP,
+ * which MCP has no equivalent of.
+ */
+export const smokeFileBrowserSweep: SweepConfig = {
+  id: "smoke-file-browser",
+  providers: [
+    { kind: "openai-compat", baseUrl: DGX_URL, model: "gemma4:31b" },
+  ],
+  promptVariants: ["spec"],
+  encodingVariants: ["indented-text"],
+  optimizationVariants: ["off"],
+  protocols: ["slop", "mcp"], // one SLOP cell and one MCP cell
+  mcpVariants: ["flat"],
+  apps: ["file-browser"],
+  dataScales: ["s"],
+  scenarioFilter: ["delete-empty-dirs"],
+  seeds: [42],
+  iterations: 1, // one run per cell
+  maxConcurrency: 1,
+  maxTurns: 30,
+  temperature: 0,
+};
diff --git a/benchmarks/v2/config/smoke-mcp.ts b/benchmarks/v2/config/smoke-mcp.ts
new file mode 100644
index 0000000..da1253c
--- /dev/null
+++ b/benchmarks/v2/config/smoke-mcp.ts
@@ -0,0 +1,32 @@
+import type { SweepConfig } from "../runner/types.ts";
+
+const DGX_URL = process.env.SLOP_DGX_URL ?? "http://slopinator-s-1.local:11434/v1"; // SLOP_DGX_URL overrides the default DGX endpoint
+
+/**
+ * SLOP-vs-MCP head-to-head smoke. One fast scenario, one model, one iter per
+ * cell × 2 protocols = 2 cells. Validates that the MCP cell runner works
+ * end-to-end and that verification via reconstruction passes.
+ */
+export const smokeMcpSweep: SweepConfig = {
+  id: "smoke-mcp",
+  providers: [
+    {
+      kind: "openai-compat",
+      baseUrl: DGX_URL,
+      model: "gemma4:31b",
+    },
+  ],
+  promptVariants: ["spec"],
+  encodingVariants: ["indented-text"],
+  optimizationVariants: ["off"],
+  protocols: ["slop", "mcp"], // the two cells of the head-to-head
+  mcpVariants: ["flat"],
+  apps: ["issue-tracker"],
+  dataScales: ["s"],
+  scenarioFilter: ["explore-and-act"],
+  seeds: [42],
+  iterations: 1, // one run per cell
+  maxConcurrency: 1,
+  maxTurns: 20,
+  temperature: 0,
+};
diff --git a/benchmarks/v2/config/smoke-todo.ts b/benchmarks/v2/config/smoke-todo.ts
new file mode 100644
index 0000000..7875e55
--- /dev/null
+++ b/benchmarks/v2/config/smoke-todo.ts
@@ -0,0 +1,27 @@
+import type { SweepConfig } from "../runner/types.ts";
+
+const DGX_URL = process.env.SLOP_DGX_URL ?? "http://slopinator-s-1.local:11434/v1"; // SLOP_DGX_URL overrides the default DGX endpoint
+
+/**
+ * Validation sweep for the todo app. Small scale, one scenario, SLOP vs MCP.
+ * Confirms the new app binding works end-to-end.
+ */
+export const smokeTodoSweep: SweepConfig = {
+  id: "smoke-todo",
+  providers: [
+    { kind: "openai-compat", baseUrl: DGX_URL, model: "gemma4:31b" },
+  ],
+  promptVariants: ["spec"],
+  encodingVariants: ["indented-text"],
+  optimizationVariants: ["off"],
+  protocols: ["slop", "mcp"], // one SLOP cell and one MCP cell
+  mcpVariants: ["flat"],
+  apps: ["todo"],
+  dataScales: ["s"],
+  scenarioFilter: ["mark-all-done"],
+  seeds: [42],
+  iterations: 1, // one run per cell
+  maxConcurrency: 1,
+  maxTurns: 30,
+  temperature: 0,
+};
diff --git a/benchmarks/v2/config/smoke.ts b/benchmarks/v2/config/smoke.ts
new file mode 100644
index 0000000..4875301
--- /dev/null
+++ b/benchmarks/v2/config/smoke.ts
@@ -0,0 +1,31 @@
+import type { SweepConfig } from "../runner/types.ts";
+
+const DGX_URL = process.env.SLOP_DGX_URL ?? "http://slopinator-s-1.local:11434/v1"; // SLOP_DGX_URL overrides the default DGX endpoint
+
+/**
+ * Smoke sweep — smallest useful cross-section. Runs on one model, one prompt,
+ * one encoding, two optimization levels, one scenario, small data, 3 iterations.
+ * Target wall time: a couple of minutes on DGX gemma4:31b.
+ */
+export const smokeSweep: SweepConfig = {
+  id: "smoke",
+  providers: [
+    {
+      kind: "openai-compat",
+      baseUrl: DGX_URL,
+      model: "gemma4:31b",
+    },
+  ],
+  promptVariants: ["spec"],
+  encodingVariants: ["indented-text"],
+  optimizationVariants: ["off", "combined"], // the only dimension with two values in this sweep
+  protocols: ["slop"], // SLOP only; no mcpVariants are configured here
+  apps: ["issue-tracker"],
+  dataScales: ["s"],
+  scenarioFilter: ["explore-and-act"],
+  seeds: [42],
+  iterations: 3, // three runs per cell
+  maxConcurrency: 1,
+  maxTurns: 20,
+  temperature: 0,
+};
diff --git a/benchmarks/v2/dashboard/app.js b/benchmarks/v2/dashboard/app.js
new file mode 100644
index 0000000..6f88e0e
--- /dev/null
+++ b/benchmarks/v2/dashboard/app.js
@@ -0,0 +1,433 @@
+// SLOP benchmarks v2 — dashboard client. Aggregates runs.jsonl in the browser
+// so the user can pivot on any two axes without regenerating data. Mirrors
+// the math in benchmarks/v2/metrics/stats.ts (kept deliberately tiny).
+
+const AXES = [ // pivot dimensions: id (select value), label (shown text), pick(aggregate) → the aggregate's value on that axis
+  { id: "app", label: "app", pick: (c) => c.cell.app },
+  { id: "scale", label: "scale", pick: (c) => c.cell.scale },
+  { id: "scenario", label: "scenario", pick: (c) => c.cell.scenario },
+  { id: "variant", label: "variant", pick: (c) => variantLabel(c.cell) }, // "mcp:<variant>" or "slop:<prompt>/<encoding>/<optimization>"
+  { id: "model", label: "model", pick: (c) => `${c.cell.provider.kind}:${c.cell.provider.model}` },
+  { id: "protocol", label: "protocol", pick: (c) => c.cell.protocol },
+  { id: "optimization", label: "optimization", pick: (c) => c.cell.optimization },
+];
+
+const METRICS = [ // displayable metrics: pick(aggregate) → number, format(number) → text for the table cell
+  { id: "passRate", label: "pass rate", format: (v) => `${(v * 100).toFixed(0)}%`, pick: (a) => a.passRate },
+  { id: "totalTokens", label: "tokens (mean)", format: fmtInt, pick: (a) => a.totalTokens.mean },
+  { id: "tokensPerSuccess", label: "tokens per success", format: fmtInt, pick: (a) => a.tokensPerSuccess }, // Infinity when the cell has no passing run
+  { id: "maxContextTokens", label: "max context tokens", format: fmtInt, pick: (a) => a.maxContextTokens.mean },
+  { id: "turns", label: "turns (mean)", format: (v) => v.toFixed(1), pick: (a) => a.turns.mean },
+  { id: "toolCalls", label: "tool calls (mean)", format: (v) => v.toFixed(1), pick: (a) => a.toolCalls.mean },
+  { id: "specComplianceRate", label: "spec compliance", format: (v) => `${(v * 100).toFixed(0)}%`, pick: (a) => a.specComplianceRate.mean },
+  { id: "totalTimeS", label: "wall time (s)", format: (v) => v.toFixed(1), pick: (a) => a.totalTimeMs.mean / 1000 }, // ms → s
+  { id: "llmTimeS", label: "llm time (s)", format: (v) => v.toFixed(1), pick: (a) => a.llmTimeMs.mean / 1000 }, // ms → s
+  { id: "costPerSuccess", label: "$ per success", format: fmtCost, pick: (a) => a.costPerSuccess }, // Infinity when the cell has no passing run
+];
+
+const state = { // single mutable UI state object shared by all handlers
+  runs: [], // run records parsed from the loaded runs.jsonl
+  cellAggregates: [], // one aggregate per cellId, produced by aggregateCells(runs)
+  rowAxis: "variant", // AXES id used for table rows
+  colAxis: "scenario", // AXES id used for table columns
+  metric: "totalTokens", // METRICS id shown in each table cell
+  filters: { app: "", scenario: "", scale: "" }, // "" means no filtering on that field
+};
+
+init(); // kick off control wiring and sweep auto-discovery
+
+// Populate the controls, wire them to `state`, then try to auto-load a sweep from the dashboard server.
+async function init() {
+  const axisOptions = AXES.map((a) => ({ value: a.id, label: a.label }));
+  populateSelect("rowAxis", axisOptions);
+  populateSelect("colAxis", axisOptions);
+  populateSelect("metric", METRICS.map((m) => ({ value: m.id, label: m.label })));
+
+  // Each pivot <select> has the same id as the state key it drives.
+  for (const id of ["rowAxis", "colAxis", "metric"]) {
+    const sel = document.getElementById(id);
+    sel.value = state[id];
+    sel.addEventListener("change", (e) => {
+      state[id] = e.target.value;
+      render();
+    });
+  }
+
+  // Filter selects map onto state.filters: "filterApp" → "app", "filterScenario" → "scenario", …
+  for (const id of ["filterApp", "filterScenario", "filterScale"]) {
+    const filterKey = id.replace("filter", "").toLowerCase();
+    document.getElementById(id).addEventListener("change", (e) => {
+      state.filters[filterKey] = e.target.value;
+      render();
+    });
+  }
+
+  document.getElementById("fileInput").addEventListener("change", onFileSelected);
+  document.getElementById("sweep").addEventListener("change", (e) => {
+    const name = e.target.value;
+    if (name) loadSweepByName(name);
+  });
+
+  // Try to auto-discover sweeps (when served from bun)
+  try {
+    const res = await fetch("/sweeps");
+    if (!res.ok) return;
+    const list = await res.json();
+    const sel = document.getElementById("sweep");
+    sel.innerHTML = '<option value="">—</option>';
+    for (const name of list) {
+      const opt = document.createElement("option");
+      opt.value = name;
+      opt.textContent = name;
+      sel.appendChild(opt);
+    }
+    // A valid ?sweep=<name> in the URL wins; otherwise open the last sweep in the list.
+    const qsSweep = new URL(location.href).searchParams.get("sweep");
+    if (qsSweep && list.includes(qsSweep)) {
+      sel.value = qsSweep;
+      loadSweepByName(qsSweep);
+    } else if (list.length > 0) {
+      sel.value = list[list.length - 1];
+      loadSweepByName(sel.value);
+    }
+  } catch {
+    // Not served from the bun dashboard server — user must pick a file manually.
+  }
+}
+
+// Fetch /results/<name>/runs.jsonl and load it; HTTP or network failures are shown in the status line.
+async function loadSweepByName(name) {
+  setStatus(`loading ${name}…`);
+  try {
+    const response = await fetch(`/results/${name}/runs.jsonl`);
+    if (!response.ok) throw new Error(`HTTP ${response.status}`);
+    loadRunsText(await response.text(), name);
+  } catch (error) {
+    setStatus(`failed to load: ${error.message}`);
+  }
+}
+
+// Load a user-picked runs.jsonl. Blob.text() rejects on read failure, so the error reaches the status line.
+function onFileSelected(e) {
+  const file = e.target.files?.[0];
+  if (!file) return;
+  const onError = (err) => setStatus(`failed to read ${file.name}: ${err?.message ?? err}`);
+  file.text().then((text) => loadRunsText(text, file.name), onError);
+}
+
+// Parse JSONL text into run records, rebuild the per-cell aggregates, refresh the filters and re-render.
+// Blank or unparseable lines, non-object values and the `type: "sweep"` header line are skipped.
+function loadRunsText(text, source) {
+  const runs = [];
+  for (const line of text.split("\n")) {
+    if (line.trim().length === 0) continue;
+    let obj;
+    try {
+      obj = JSON.parse(line);
+    } catch {
+      continue;
+    }
+    // `null` and primitives are valid JSON but not run records; a run must also carry its
+    // `cell`, which every axis, filter and aggregate dereferences later on.
+    if (obj && typeof obj === "object" && obj.type !== "sweep" && obj.cellId && obj.cell) runs.push(obj);
+  }
+  state.runs = runs;
+  state.cellAggregates = aggregateCells(runs);
+  setStatus(`loaded ${source}: ${runs.length} runs, ${state.cellAggregates.length} cells`);
+  populateFilter("filterApp", runs.map((r) => r.cell.app));
+  populateFilter("filterScenario", runs.map((r) => r.cell.scenario));
+  populateFilter("filterScale", runs.map((r) => r.cell.scale));
+  render();
+}
+
+// Render the pivot table for the current axes/metric/filters into #table-host; all interpolated values (cell ids included) are HTML-escaped.
+function render() {
+  const host = document.getElementById("table-host");
+  const cells = filterCells(state.cellAggregates, state.filters);
+  if (cells.length === 0) {
+    host.innerHTML = '<div class="empty-message">no cells match the current filters</div>';
+    return;
+  }
+  const rowAxis = AXES.find((a) => a.id === state.rowAxis);
+  const colAxis = AXES.find((a) => a.id === state.colAxis);
+  const metric = METRICS.find((m) => m.id === state.metric);
+  const rowValues = unique(cells.map((c) => rowAxis.pick(c)));
+  const colValues = unique(cells.map((c) => colAxis.pick(c)));
+
+  const grid = new Map();
+  for (const c of cells) {
+    const key = `${rowAxis.pick(c)}__${colAxis.pick(c)}`;
+    const list = grid.get(key) ?? [];
+    list.push(c);
+    grid.set(key, list);
+  }
+
+  let html = "<table><thead><tr><th class='row-header'></th>";
+  for (const col of colValues) html += `<th>${escapeHtml(col)}</th>`;
+  html += "</tr></thead><tbody>";
+  for (const row of rowValues) {
+    html += `<tr><td class='row-header'>${escapeHtml(row)}</td>`;
+    for (const col of colValues) {
+      const list = grid.get(`${row}__${col}`) ?? [];
+      if (list.length === 0) {
+        html += "<td class='empty'>—</td>";
+      } else {
+        const mergedPass = mergePassRate(list);
+        const val = mergeMetric(list, metric);
+        const sample = list.reduce((a, c) => a + c.runs, 0);
+        const passClass = mergedPass === 1 ? "" : mergedPass >= 0.5 ? "warn" : "bad";
+        // stdevOf only measures spread BETWEEN aggregates, so "±" is shown only when several were merged.
+        const spread = list.length > 1 ? ` ± ${metric.format(stdevOf(list, metric))}` : "";
+        html += `<td class='cell' data-key='${escapeHtml(row)}__${escapeHtml(col)}' data-cells='${escapeHtml(list.map((c) => c.cellId).join(","))}'>` +
+          `<div class='primary'>${escapeHtml(metric.format(val))}</div>` +
+          `<div class='secondary'>n=${sample}${escapeHtml(spread)}</div>` +
+          `<div class='passbar ${passClass}'><span style='width:${(mergedPass * 100).toFixed(0)}%'></span></div>` +
+          "</td>";
+      }
+    }
+    html += "</tr>";
+  }
+  html += "</tbody></table>";
+  host.innerHTML = html;
+  host.querySelectorAll("td.cell").forEach((td) => {
+    td.addEventListener("click", () => openCellDetail(td.dataset.cells));
+  });
+}
+
+// Run-weighted pass rate across aggregates (0 when they contain no runs at all).
+function mergePassRate(aggs) {
+  let runs = 0, passes = 0;
+  for (const c of aggs) { runs += c.runs; passes += c.passRate * c.runs; }
+  return runs === 0 ? 0 : passes / runs;
+}
+
+// Run-weighted mean of `metric` over the aggregates of one pivot cell. Aggregates with a non-finite
+// value (e.g. ∞ tokens-per-success when nothing passed) are excluded from the sum AND the weight.
+// Returns 0 when there are no runs, and Infinity when there are runs but no finite value to average.
+function mergeMetric(aggs, metric) {
+  let sum = 0, weight = 0, runs = 0;
+  for (const c of aggs) {
+    runs += c.runs;
+    const v = metric.pick(c);
+    if (isFinite(v)) { sum += v * c.runs; weight += c.runs; }
+  }
+  return weight > 0 ? sum / weight : runs === 0 ? 0 : Infinity;
+}
+
+function stdevOf(aggs, metric) {
+  // Run-weighted sample stdev of the per-aggregate metric values (incremental update); within-aggregate spread is not included.
+  let n = 0; // total runs seen so far (the weights)
+  let ssq = 0; // weighted sum of squared deviations from the running mean
+  let mean = 0; // running run-weighted mean
+  for (const c of aggs) {
+    const nv = c.runs;
+    const mv = metric.pick(c);
+    if (!isFinite(mv)) continue; // non-finite values (e.g. ∞ per-success figures) are ignored
+    const delta = mv - mean;
+    n += nv;
+    mean += (delta * nv) / n;
+    ssq += nv * delta * (mv - mean); // uses the already-updated mean, so statement order matters here
+  }
+  if (n < 2) return 0;
+  return Math.sqrt(ssq / (n - 1)); // with a single aggregate the deviations vanish, so the result is (numerically) 0
+}
+
+function openCellDetail(cellIds) { // Fill and open the detail modal for a comma-joined list of cellIds (one pivot cell).
+  const ids = cellIds.split(","); // NOTE(review): assumes cellIds never contain "," themselves — confirm against the runner
+  const aggs = state.cellAggregates.filter((c) => ids.includes(c.cellId));
+  if (aggs.length === 0) return;
+  const runs = state.runs.filter((r) => ids.includes(r.cellId));
+  const title = `${aggs.map((a) => variantLabel(a.cell)).join(" / ")}`;
+  document.getElementById("modal-title").textContent = title; // textContent, so no escaping is needed here
+  const body = document.getElementById("modal-body");
+
+  let html = "";
+  for (const a of aggs) { // one key/value summary block per aggregate
+    const cats = a.failureCategories;
+    const total = Object.values(cats).reduce((s, v) => s + v, 0);
+    html += `<h3 style='font-size:12px; color: var(--accent); margin:12px 0 6px'>${escapeHtml(variantLabel(a.cell))} × ${escapeHtml(a.cell.scenario)}</h3>`;
+    html += "<dl class='kv'>";
+    html += `<dt>cellId</dt><dd>${escapeHtml(a.cellId)}</dd>`;
+    html += `<dt>runs</dt><dd>${a.runs}</dd>`;
+    html += `<dt>pass rate</dt><dd>${(a.passRate * 100).toFixed(0)}%</dd>`;
+    html += `<dt>spec compliance</dt><dd>${(a.specComplianceRate.mean * 100).toFixed(0)}%</dd>`;
+    html += `<dt>tokens</dt><dd>${fmtInt(a.totalTokens.mean)} (p95 ${fmtInt(a.totalTokens.p95)}, σ ${fmtInt(a.totalTokens.stdev)})</dd>`;
+    html += `<dt>max context</dt><dd>${fmtInt(a.maxContextTokens.mean)}</dd>`;
+    html += `<dt>turns</dt><dd>${a.turns.mean.toFixed(1)}</dd>`;
+    html += `<dt>tool calls</dt><dd>${a.toolCalls.mean.toFixed(1)}</dd>`;
+    html += `<dt>wall time</dt><dd>${(a.totalTimeMs.mean / 1000).toFixed(1)}s</dd>`;
+    html += `<dt>llm time</dt><dd>${(a.llmTimeMs.mean / 1000).toFixed(1)}s</dd>`;
+    html += `<dt>$ per success</dt><dd>${fmtCost(a.costPerSuccess)}</dd>`;
+    html += "</dl>";
+    if (total > 0) { // stacked bar: one segment per non-zero failure category, width proportional to its count
+      html += "<div class='category-bar'>";
+      for (const [key, val] of Object.entries(cats)) {
+        if (val === 0) continue;
+        html += `<div class='cat-${key}' style='width:${(val / total * 100).toFixed(1)}%' title='${key}: ${val}'></div>`;
+      }
+      html += "</div>";
+      html += `<div class='secondary' style='font-size: 11px; color: var(--text-dim); margin-top: 4px'>${Object.entries(cats).filter(([, v]) => v > 0).map(([k, v]) => `${k}=${v}`).join(" / ")}</div>`;
+    }
+  }
+
+  // Per-run table
+  html += "<h3 style='font-size:12px; color: var(--accent); margin: 16px 0 6px'>per-run detail</h3>";
+  html += "<pre>";
+  html += `${"iter".padEnd(5)}${"turns".padStart(7)}${"calls".padStart(7)}${"spec%".padStart(7)}${"tok".padStart(9)}${"ctxMx".padStart(8)}${"t(s)".padStart(8)} verify\n`;
+  for (const r of runs) {
+    const m = r.metrics;
+    if (!m) { // run without metrics: show only the first line of its error text
+      html += `${String(r.cell.iteration).padEnd(5)} ERROR: ${escapeHtml((r.error ?? "").split("\n")[0])}\n`;
+      continue;
+    }
+    const v = m.verification ? `${m.verification.passedChecks}/${m.verification.totalChecks}` : "—";
+    html += `${String(r.cell.iteration).padEnd(5)}${String(m.turns).padStart(7)}${String(m.toolCalls).padStart(7)}${`${(m.specComplianceRate * 100).toFixed(0)}%`.padStart(7)}${fmtInt(m.totalTokens).padStart(9)}${fmtInt(m.maxContextTokens).padStart(8)}${(m.totalTimeMs / 1000).toFixed(1).padStart(8)} ${v}\n`;
+  }
+  html += "</pre>";
+
+  body.innerHTML = html; // NOTE(review): iteration / turns / toolCalls / check counts are interpolated unescaped — fine only while the file holds numbers there
+  document.getElementById("modal").showModal();
+}
+
+// Group runs by cellId and reduce each group to one aggregate record; groups where no run produced metrics are dropped.
+function aggregateCells(runs) {
+  const pricing = { "gemma4:31b": [0, 0], "gemma4:e4b-it": [0, 0], "nemotron-3-super:120b": [0, 0] }; // [input, output] price per 1M tokens
+  const buckets = new Map();
+  for (const r of runs) {
+    if (!buckets.has(r.cellId)) buckets.set(r.cellId, []);
+    buckets.get(r.cellId).push(r);
+  }
+  const out = [];
+  for (const [cellId, bucket] of buckets) {
+    const metrics = bucket.map((r) => r.metrics).filter(Boolean);
+    if (metrics.length === 0) continue;
+    const { cell } = bucket[0];
+    const passCount = bucket.filter((r) => r.metrics?.verification?.passed === true).length;
+    const agg = (pick) => numericAgg(metrics.map(pick));
+    const [inPrice, outPrice] = pricing[cell.provider.model] ?? [0, 0];
+    const costMean = metrics.reduce((s, m) => s + (m.inputTokens * inPrice + m.outputTokens * outPrice) / 1_000_000, 0) / metrics.length;
+    const totalTokens = agg((m) => m.totalTokens);
+    const perSuccess = (mean) => (passCount > 0 ? (mean * bucket.length) / passCount : Infinity); // all runs charged to the passing ones
+    out.push({
+      cellId,
+      cell,
+      runs: bucket.length,
+      passRate: passCount / bucket.length,
+      failureCategories: countCategories(bucket),
+      totalTokens,
+      inputTokens: agg((m) => m.inputTokens),
+      outputTokens: agg((m) => m.outputTokens),
+      maxContextTokens: agg((m) => m.maxContextTokens),
+      turns: agg((m) => m.turns),
+      toolCalls: agg((m) => m.toolCalls),
+      specComplianceRate: agg((m) => m.specComplianceRate),
+      llmTimeMs: agg((m) => m.llmTimeMs),
+      totalTimeMs: agg((m) => m.totalTimeMs),
+      transportBytes: agg((m) => m.transportBytesSent + m.transportBytesReceived),
+      costUsd: { count: metrics.length, mean: costMean, median: costMean, p95: costMean, stdev: 0, min: costMean, max: costMean },
+      costPerSuccess: perSuccess(costMean),
+      tokensPerSuccess: perSuccess(totalTokens.mean),
+    });
+  }
+  return out;
+}
+
+// Tally failure categories over runs. The tool-error and max_turns counters are not exclusive: one run
+// can bump several of them and is still counted under exactly one of ok / verify_fail / no_verifier.
+function countCategories(runs) {
+  const counts = { ok: 0, no_verifier: 0, verify_fail: 0, max_turns: 0, tool_unknown: 0, tool_invoke_error: 0, tool_param_error: 0, cell_exception: 0 };
+  const bump = (key) => { counts[key] += 1; };
+  for (const { error, metrics: m } of runs) {
+    if (error) bump("cell_exception");
+    if (error || !m) continue;
+    if (m.unknownToolCalls > 0) bump("tool_unknown");
+    if (m.invokeErrorCalls > 0) bump("tool_invoke_error");
+    if (m.paramErrorCalls > 0) bump("tool_param_error");
+    if (m.finishReason === "max_turns") bump("max_turns");
+    bump(!m.verification ? "no_verifier" : m.verification.passed ? "ok" : "verify_fail");
+  }
+  return counts;
+}
+
+// Summary stats over the finite entries of `values`; stdev is the sample (n − 1) form, 0 below two values; all zeros when empty.
+function numericAgg(values) {
+  const vs = values.filter(Number.isFinite);
+  if (!vs.length) return { count: 0, mean: 0, median: 0, p95: 0, stdev: 0, min: 0, max: 0 };
+  const sorted = vs.slice().sort((a, b) => a - b);
+  const mean = vs.reduce((s, v) => s + v, 0) / vs.length;
+  const sumSq = vs.reduce((s, v) => s + (v - mean) ** 2, 0);
+  const stdev = vs.length > 1 ? Math.sqrt(sumSq / (vs.length - 1)) : 0;
+  return { count: vs.length, mean, median: percentile(sorted, 0.5), p95: percentile(sorted, 0.95), stdev, min: sorted[0], max: sorted[sorted.length - 1] };
+}
+
+// q-quantile (0 ≤ q ≤ 1) of an ascending-sorted array, linearly interpolated between neighbouring ranks; 0 when empty.
+function percentile(sorted, q) {
+  const n = sorted.length;
+  if (n < 2) return n === 0 ? 0 : sorted[0];
+  const pos = q * (n - 1);
+  const below = Math.floor(pos);
+  const above = Math.ceil(pos);
+  return below === above ? sorted[below] : sorted[below] + (sorted[above] - sorted[below]) * (pos - below);
+}
+
+// Variant label: "mcp:<mcpVariant, default flat>" for MCP cells, otherwise "slop:<prompt>/<encoding>/<optimization>".
+function variantLabel(cell) {
+  return cell.protocol === "mcp" ? `mcp:${cell.mcpVariant ?? "flat"}` : `slop:${cell.prompt}/${cell.encoding}/${cell.optimization}`;
+}
+
+// Keep the aggregates whose cell matches every non-empty filter (app, scenario, scale).
+// An empty filter value matches anything.
+function filterCells(cells, filters) {
+  const keys = ["app", "scenario", "scale"];
+  return cells.filter((c) =>
+    keys.every((k) => !filters[k] || c.cell[k] === filters[k]),
+  );
+}
+
+function unique(arr) {
+  return [...new Set(arr)]; // distinct values, in first-seen order
+}
+
+// Replace the options of <select id> with `items` ({ value, label } pairs), in the given order.
+function populateSelect(id, items) {
+  const sel = document.getElementById(id);
+  sel.innerHTML = "";
+  const options = items.map(({ value, label }) => {
+    const opt = document.createElement("option");
+    return Object.assign(opt, { value, textContent: label });
+  });
+  sel.append(...options);
+}
+
+// Rebuild a filter <select>: an "all" entry (value "") followed by the distinct values,
+// restoring the previous selection when it is still among them.
+function populateFilter(id, values) {
+  const sel = document.getElementById(id);
+  const previous = sel.value;
+  const distinct = unique(values);
+  sel.innerHTML = '<option value="">all</option>';
+  for (const v of distinct) {
+    sel.appendChild(Object.assign(document.createElement("option"), { value: v, textContent: v }));
+  }
+  if (distinct.includes(previous)) sel.value = previous;
+}
+
+// Round to an integer and format with locale grouping; non-finite values (∞, NaN) render as "∞".
+function fmtInt(v) {
+  return Number.isFinite(v) ? Math.round(v).toLocaleString() : "∞";
+}
+
+// Dollar formatting: "∞" for non-finite, "$0" for exactly zero, 4 decimals below one cent, otherwise 3.
+function fmtCost(v) {
+  if (!Number.isFinite(v)) return "∞";
+  if (v === 0) return "$0";
+  return `$${v.toFixed(v < 0.01 ? 4 : 3)}`;
+}
+
+function escapeHtml(s) { // escape & < > " ' so the text is safe in HTML content and in quoted attributes
+  return String(s).replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;").replace(/"/g, "&quot;").replace(/'/g, "&#39;");
+}
+
+function setStatus(msg) { // show a status message in the #status element
+  document.querySelector("#status").textContent = msg;
+}
diff --git a/benchmarks/v2/dashboard/index.html b/benchmarks/v2/dashboard/index.html
new file mode 100644
index 0000000..b2abdb1
--- /dev/null
+++ b/benchmarks/v2/dashboard/index.html
@@ -0,0 +1,131 @@
+<!doctype html>
+<html lang="en">
+<head>
+  <meta charset="utf-8" />
+  <title>SLOP benchmarks v2 — dashboard</title>
+  <meta name="viewport" content="width=device-width, initial-scale=1" />
+  <style>
+    :root {
+      --bg: #0b0f14;
+      --panel: #121821;
+      --panel-2: #1a2230;
+      --border: #263146;
+      --text: #e7edf5;
+      --text-dim: #8a9bb5;
+      --accent: #9ef87a;
+      --good: #6ee68a;
+      --bad: #f47373;
+      --warn: #f2c14e;
+      --mono: "SF Mono", "Menlo", "Monaco", Consolas, monospace;
+    }
+    * { box-sizing: border-box; }
+    body { margin: 0; font-family: var(--mono); background: var(--bg); color: var(--text); font-size: 13px; }
+    header { padding: 12px 16px; border-bottom: 1px solid var(--border); display: flex; gap: 16px; align-items: baseline; background: var(--panel); }
+    header h1 { margin: 0; font-size: 14px; font-weight: 500; letter-spacing: 0.5px; color: var(--accent); }
+    header .status { color: var(--text-dim); }
+    .controls { padding: 12px 16px; display: flex; flex-wrap: wrap; gap: 12px; align-items: flex-end; background: var(--panel); border-bottom: 1px solid var(--border); }
+    .control { display: flex; flex-direction: column; gap: 4px; }
+    .control label { font-size: 11px; color: var(--text-dim); text-transform: uppercase; letter-spacing: 0.5px; }
+    .control select, .control input { background: var(--panel-2); color: var(--text); border: 1px solid var(--border); padding: 6px 8px; font-family: inherit; font-size: 12px; min-width: 140px; }
+    .content { padding: 16px; }
+    table { width: 100%; border-collapse: collapse; background: var(--panel); }
+    th, td { border: 1px solid var(--border); padding: 8px 10px; text-align: right; }
+    th { background: var(--panel-2); color: var(--text-dim); font-weight: 500; text-align: right; position: sticky; top: 0; z-index: 1; }
+    th.row-header, td.row-header { text-align: left; color: var(--text-dim); background: var(--panel-2); }
+    td { font-variant-numeric: tabular-nums; cursor: pointer; transition: background 0.1s; }
+    td.cell:hover { background: var(--panel-2); }
+    td.empty { color: var(--text-dim); background: var(--bg); cursor: default; }
+    .cell .primary { color: var(--text); font-size: 13px; }
+    .cell .secondary { color: var(--text-dim); font-size: 11px; margin-top: 2px; }
+    .passbar { display: inline-block; height: 4px; width: 60px; background: var(--panel-2); border-radius: 2px; margin-top: 4px; overflow: hidden; }
+    .passbar > span { display: block; height: 100%; background: var(--good); }
+    .passbar.bad > span { background: var(--bad); }
+    .passbar.warn > span { background: var(--warn); }
+    .legend { display: flex; gap: 20px; margin-top: 12px; color: var(--text-dim); font-size: 11px; }
+    .legend span { display: flex; align-items: center; gap: 6px; }
+    .legend .dot { display: inline-block; width: 10px; height: 10px; border-radius: 2px; }
+    .empty-message { color: var(--text-dim); padding: 40px; text-align: center; }
+    dialog { background: var(--panel); color: var(--text); border: 1px solid var(--border); padding: 0; max-width: 720px; width: 90%; font-family: inherit; }
+    dialog::backdrop { background: rgba(0,0,0,0.65); }
+    dialog .modal-header { padding: 12px 16px; border-bottom: 1px solid var(--border); display: flex; justify-content: space-between; align-items: center; }
+    dialog .modal-header h2 { margin: 0; font-size: 13px; font-weight: 500; color: var(--accent); }
+    dialog .modal-body { padding: 16px; max-height: 70vh; overflow: auto; }
+    dialog pre { background: var(--bg); border: 1px solid var(--border); padding: 10px; font-size: 11px; overflow: auto; white-space: pre-wrap; word-break: break-word; }
+    .close-btn { background: none; border: 1px solid var(--border); color: var(--text); font-family: inherit; padding: 4px 10px; cursor: pointer; }
+    .close-btn:hover { background: var(--panel-2); }
+    .kv { display: grid; grid-template-columns: 140px 1fr; gap: 4px 12px; font-size: 12px; }
+    .kv dt { color: var(--text-dim); }
+    .category-bar { display: flex; height: 6px; margin-top: 8px; border-radius: 3px; overflow: hidden; }
+    .category-bar > div { height: 100%; }
+    .cat-ok { background: var(--good); }
+    .cat-max_turns { background: var(--warn); }
+    .cat-verify_fail { background: var(--bad); }
+    .cat-tool_unknown { background: #f4a261; }
+    .cat-tool_invoke_error { background: #e76f51; }
+    .cat-tool_param_error { background: #cb997e; }
+    .cat-cell_exception { background: #d62828; }
+    .cat-no_verifier { background: #5f6a7d; }
+  </style>
+</head>
+<body>
+  <header>
+    <h1>SLOP benchmarks v2</h1>
+    <span class="status" id="status">pick a sweep to load</span>
+  </header>
+
+  <div class="controls">
+    <div class="control">
+      <label for="sweep">sweep</label>
+      <select id="sweep"><option value="">—</option></select>
+    </div>
+    <div class="control">
+      <label for="fileInput">or file</label>
+      <input type="file" id="fileInput" accept=".jsonl,.json" />
+    </div>
+    <div class="control">
+      <label for="rowAxis">rows</label>
+      <select id="rowAxis"></select>
+    </div>
+    <div class="control">
+      <label for="colAxis">columns</label>
+      <select id="colAxis"></select>
+    </div>
+    <div class="control">
+      <label for="metric">metric</label>
+      <select id="metric"></select>
+    </div>
+    <div class="control">
+      <label for="filterApp">app</label>
+      <select id="filterApp"><option value="">all</option></select>
+    </div>
+    <div class="control">
+      <label for="filterScenario">scenario</label>
+      <select id="filterScenario"><option value="">all</option></select>
+    </div>
+    <div class="control">
+      <label for="filterScale">scale</label>
+      <select id="filterScale"><option value="">all</option></select>
+    </div>
+  </div>
+
+  <div class="content">
+    <div id="table-host"></div>
+    <div class="legend">
+      <span><span class="dot" style="background: var(--good)"></span>pass rate 100%</span>
+      <span><span class="dot" style="background: var(--warn)"></span>pass rate 50–99%</span>
+      <span><span class="dot" style="background: var(--bad)"></span>pass rate &lt;50%</span>
+      <span>cells are clickable — per-run drill-down</span>
+    </div>
+  </div>
+
+  <dialog id="modal">
+    <div class="modal-header">
+      <h2 id="modal-title">cell detail</h2>
+      <button class="close-btn" onclick="document.getElementById('modal').close()">close</button>
+    </div>
+    <div class="modal-body" id="modal-body"></div>
+  </dialog>
+
+  <script type="module" src="./app.js"></script>
+</body>
+</html>
diff --git a/benchmarks/v2/dashboard/serve.ts b/benchmarks/v2/dashboard/serve.ts
new file mode 100644
index 0000000..402dcf7
--- /dev/null
+++ b/benchmarks/v2/dashboard/serve.ts
@@ -0,0 +1,71 @@
+/**
+ * Tiny bun server for the benchmarks dashboard. Serves the static dashboard
+ * files from this directory and exposes /results/<sweep>/runs.jsonl plus a
+ * /sweeps endpoint that lists every sweep id with a runs.jsonl on disk.
+ *
+ * Run with: `bun run dashboard/serve.ts` (or `bun run dash` from v2/).
+ */
+import { existsSync, readFileSync, readdirSync, statSync } from "node:fs";
+import { join, resolve } from "node:path";
+
+// Directory containing this file. `URL#pathname` is percent-encoded (a
+// checkout under ".../my repo/" yields ".../my%20repo/"), so decode it before
+// using it as a filesystem path. The trailing slash is preserved.
+const DASH_DIR = decodeURIComponent(new URL(".", import.meta.url).pathname);
+// Parent of the dashboard directory.
+const V2_ROOT = resolve(DASH_DIR, "..");
+// Sweep output is read from <V2_ROOT>/results/<sweep>/runs.jsonl.
+const RESULTS_DIR = join(V2_ROOT, "results");
+// Listening port; override with the DASH_PORT environment variable.
+const PORT = Number(process.env.DASH_PORT ?? 4180);
+
+// Content-Type values keyed by file extension (no leading dot). contentType()
+// falls back to text/plain for extensions that are not listed here.
+const MIME: Record<string, string> = {
+  html: "text/html; charset=utf-8",
+  js: "application/javascript; charset=utf-8",
+  css: "text/css; charset=utf-8",
+  json: "application/json; charset=utf-8",
+  jsonl: "application/x-ndjson; charset=utf-8",
+};
+
+/**
+ * Pick the Content-Type for a file path from its extension.
+ *
+ * Only the final path segment is inspected, the extension is matched
+ * case-insensitively, and only MIME's own keys count. A dotted directory name
+ * or an extension-less file called e.g. "constructor" therefore falls back to
+ * text/plain instead of producing a bogus header value.
+ */
+function contentType(path: string): string {
+  const fallback = "text/plain; charset=utf-8";
+  const name = path.slice(path.lastIndexOf("/") + 1);
+  const dot = name.lastIndexOf(".");
+  if (dot === -1) return fallback;
+  const ext = name.slice(dot + 1).toLowerCase();
+  const known = Object.prototype.hasOwnProperty.call(MIME, ext);
+  return (known ? MIME[ext] : undefined) ?? fallback;
+}
+
+/** True when `file` is `root` itself or lies underneath it. */
+function isInside(root: string, file: string): boolean {
+  // Compare against "<root>/" so that a sibling directory sharing the prefix
+  // (e.g. "<root>-old") cannot pass a bare startsWith check.
+  const base = root.endsWith("/") ? root : `${root}/`;
+  return file === root || file.startsWith(base);
+}
+
+/** Respond with the contents of `file`, or 404 when it is missing or not a regular file. */
+function serveFile(file: string): Response {
+  if (!existsSync(file) || !statSync(file).isFile()) return new Response("not found", { status: 404 });
+  return new Response(readFileSync(file), { headers: { "Content-Type": contentType(file) } });
+}
+
+// Routes:
+//   GET /sweeps          → JSON array of sweep ids that have a runs.jsonl
+//   GET /results/<path>  → file below RESULTS_DIR
+//   anything else        → static file below DASH_DIR ("/" → index.html)
+Bun.serve({
+  port: PORT,
+  async fetch(req) {
+    const url = new URL(req.url);
+    let path: string;
+    try {
+      path = decodeURIComponent(url.pathname);
+    } catch {
+      // A malformed percent-escape is a client error, not a server failure.
+      return new Response("bad request", { status: 400 });
+    }
+
+    if (path === "/sweeps") {
+      if (!existsSync(RESULTS_DIR)) return Response.json([]);
+      // Stat each runs.jsonl once up front rather than inside the sort
+      // comparator, which would repeat the syscall for every comparison.
+      const sweeps: Array<{ id: string; mtimeMs: number }> = [];
+      for (const entry of readdirSync(RESULTS_DIR)) {
+        const runsPath = join(RESULTS_DIR, entry, "runs.jsonl");
+        if (existsSync(runsPath)) sweeps.push({ id: entry, mtimeMs: statSync(runsPath).mtimeMs });
+      }
+      // Oldest first, by runs.jsonl modification time.
+      sweeps.sort((a, b) => a.mtimeMs - b.mtimeMs);
+      return Response.json(sweeps.map((s) => s.id));
+    }
+
+    if (path.startsWith("/results/")) {
+      // The decoded path may contain "../" (from an encoded slash), so the
+      // resolved location is checked against the results root before serving.
+      const file = resolve(RESULTS_DIR, path.slice("/results/".length));
+      if (!isInside(RESULTS_DIR, file)) return new Response("forbidden", { status: 403 });
+      return serveFile(file);
+    }
+
+    // Dashboard static files
+    const localPath = path === "/" ? "/index.html" : path;
+    const file = resolve(DASH_DIR, `.${localPath}`);
+    if (!isInside(DASH_DIR, file)) return new Response("forbidden", { status: 403 });
+    return serveFile(file);
+  },
+});
+
+console.log(`[dashboard] http://localhost:${PORT}`);
+console.log(`[dashboard] serving dashboard from ${DASH_DIR}`);
+console.log(`[dashboard] serving results from  ${RESULTS_DIR}`);
diff --git a/benchmarks/v2/metrics/aggregate.ts b/benchmarks/v2/metrics/aggregate.ts
new file mode 100644
index 0000000..e0415ff
--- /dev/null
+++ b/benchmarks/v2/metrics/aggregate.ts
@@ -0,0 +1,104 @@
+import { readFileSync, writeFileSync } from "node:fs";
+import { dirname, join } from "node:path";
+import { parseArgs } from "node:util";
+import { aggregateCells, loadRuns, type CellAggregate, type NumericAggregate } from "./stats.ts";
+import { cellLabel } from "../runner/hash.ts";
+
+// CLI flags: --input <path to runs.jsonl> (required) and --json (also write
+// aggregated.json next to the input).
+const { values } = parseArgs({
+  options: {
+    input: { type: "string" },
+    json: { type: "boolean", default: false },
+  },
+});
+
+const input = values.input;
+if (!input) {
+  console.error("usage: bun run metrics/aggregate.ts --input results/<sweep-id>/runs.jsonl [--json]");
+  process.exit(1);
+}
+
+// One JSON document per line; loadRuns skips blank lines and objects that
+// carry no cellId.
+const raw = readFileSync(input, "utf8").split("\n");
+const records = loadRuns(raw);
+if (records.length === 0) {
+  console.error(`no run records found in ${input}`);
+  process.exit(1);
+}
+const aggregates = aggregateCells(records);
+
+if (values.json) {
+  // Written into the same directory as the input file.
+  const out = join(dirname(input), "aggregated.json");
+  writeFileSync(out, JSON.stringify({ source: input, runs: records.length, cells: aggregates }, null, 2));
+  console.log(`wrote ${out}`);
+}
+
+// The table is printed whether or not --json was given.
+printTable(aggregates);
+
+/**
+ * Print one fixed-width row per cell (ordered by cell id), then a one-line
+ * summary of runs, passes and cell count.
+ *
+ * The caller's array is not reordered. The summary is derived from the
+ * aggregates themselves so that its run and pass counts agree with the N and
+ * pass% columns: aggregateCells de-duplicates records by runId, so counting
+ * the raw record list would double-count runs that were resumed after a fix.
+ */
+function printTable(cells: CellAggregate[]) {
+  const rows = [...cells].sort((a, b) => a.cellId.localeCompare(b.cellId));
+  const header = [
+    "cell".padEnd(18),
+    "N".padStart(3),
+    "pass%".padStart(6),
+    "tok̄".padStart(7),
+    "tok₉₅".padStart(7),
+    "ctxMx".padStart(6),
+    "turns̄".padStart(7),
+    "calls̄".padStart(7),
+    "spec%".padStart(6),
+    "t̄(s)".padStart(6),
+    "$/✓".padStart(8),
+    "tok/✓".padStart(9),
+    "label",
+  ];
+  console.log(header.join(" "));
+  let totalRuns = 0;
+  let totalPass = 0;
+  for (const c of rows) {
+    totalRuns += c.runs;
+    // passRate is passCount / runs, so rounding recovers the integer count.
+    totalPass += Math.round(c.passRate * c.runs);
+    const passPct = `${(c.passRate * 100).toFixed(0)}%`;
+    const specPct = `${(c.specComplianceRate.mean * 100).toFixed(0)}%`;
+    // cellLabel needs an iteration; use 0 and strip it from the label again.
+    const label = cellLabel({ ...c.cell, iteration: 0 }).replace(` | iter=0`, "");
+    console.log(
+      [
+        c.cellId.slice(0, 16).padEnd(18),
+        String(c.runs).padStart(3),
+        passPct.padStart(6),
+        fmt(c.totalTokens, 0).padStart(7),
+        fmt95(c.totalTokens).padStart(7),
+        fmtNum(c.maxContextTokens.mean, 0).padStart(6),
+        fmt(c.turns, 1).padStart(7),
+        fmt(c.toolCalls, 1).padStart(7),
+        specPct.padStart(6),
+        fmtNum(c.totalTimeMs.mean / 1000, 1).padStart(6),
+        fmtCost(c.costPerSuccess).padStart(8),
+        fmtNum(c.tokensPerSuccess, 0).padStart(9),
+        label,
+      ].join(" "),
+    );
+  }
+  const passRate = totalRuns > 0 ? (totalPass / totalRuns) * 100 : 0;
+  console.log(`\n${totalRuns} runs, ${totalPass} pass (${passRate.toFixed(0)}%), ${rows.length} unique cells`);
+}
+
+/** Locale-format an aggregate's mean with at most `digits` decimals; "–" when it has no samples. */
+function fmt(agg: NumericAggregate, digits: number): string {
+  const { count, mean } = agg;
+  return count === 0 ? "–" : mean.toLocaleString(undefined, { maximumFractionDigits: digits });
+}
+
+/** Locale-format an aggregate's 95th percentile as a whole number; "–" when it has no samples. */
+function fmt95(agg: NumericAggregate): string {
+  return agg.count === 0 ? "–" : agg.p95.toLocaleString(undefined, { maximumFractionDigits: 0 });
+}
+
+/** Locale-format `n` with at most `digits` decimals; "–" for NaN or ±Infinity. */
+function fmtNum(n: number, digits: number): string {
+  return Number.isFinite(n) ? n.toLocaleString(undefined, { maximumFractionDigits: digits }) : "–";
+}
+
+/**
+ * Format a USD amount: "∞" for non-finite input, "$0" for exactly zero,
+ * four decimals below one cent and three decimals otherwise.
+ */
+function fmtCost(n: number): string {
+  if (!Number.isFinite(n)) return "∞";
+  if (n === 0) return "$0";
+  const decimals = n < 0.01 ? 4 : 3;
+  return "$" + n.toFixed(decimals);
+}
diff --git a/benchmarks/v2/metrics/cost.ts b/benchmarks/v2/metrics/cost.ts
new file mode 100644
index 0000000..e5b0513
--- /dev/null
+++ b/benchmarks/v2/metrics/cost.ts
@@ -0,0 +1,40 @@
+import type { ProviderConfig } from "../runner/types.ts";
+
+/**
+ * Pricing per million tokens (USD). Local models on DGX cost $0 — we track
+ * them with zeros so cost-per-success is consistent across the matrix and the
+ * dashboard can still show the ratio.
+ */
+export interface ModelPricing {
+  /** USD per one million input tokens. */
+  inputPerMillion: number;
+  /** USD per one million output tokens. */
+  outputPerMillion: number;
+}
+
+/**
+ * Price table keyed by model name (looked up with `provider.model`).
+ * A model missing from this table is costed at 0 by estimateCostUsd and is
+ * not reported as local by isLocal.
+ */
+export const PRICING: Record<string, ModelPricing> = {
+  // Local — DGX Ollama
+  "gemma4:31b": { inputPerMillion: 0, outputPerMillion: 0 },
+  "gemma4:e2b-it": { inputPerMillion: 0, outputPerMillion: 0 },
+  "gemma4:e4b-it": { inputPerMillion: 0, outputPerMillion: 0 },
+  "gemma4:26b-a4b-it": { inputPerMillion: 0, outputPerMillion: 0 },
+  "gemma4:31b-it": { inputPerMillion: 0, outputPerMillion: 0 },
+  "nemotron-3-super:120b": { inputPerMillion: 0, outputPerMillion: 0 },
+  // API reference anchors (mirror v1 pricing table).
+  "gemini-2.5-flash": { inputPerMillion: 0.3, outputPerMillion: 2.5 },
+  "gemini-2.5-pro": { inputPerMillion: 1.25, outputPerMillion: 10.0 },
+  "gpt-4.1-nano": { inputPerMillion: 0.1, outputPerMillion: 0.4 },
+  "gpt-4.1-mini": { inputPerMillion: 0.4, outputPerMillion: 1.6 },
+  "gpt-4.1": { inputPerMillion: 2.0, outputPerMillion: 8.0 },
+  "claude-sonnet-4": { inputPerMillion: 3.0, outputPerMillion: 15.0 },
+  "claude-opus-4": { inputPerMillion: 15.0, outputPerMillion: 75.0 },
+};
+
+/**
+ * Estimate the USD cost of a call from its token counts, using the PRICING
+ * entry for `provider.model`. Models without an entry are costed at 0.
+ */
+export function estimateCostUsd(provider: ProviderConfig, inputTokens: number, outputTokens: number): number {
+  const pricing = PRICING[provider.model];
+  if (!pricing) return 0;
+  const inputCost = (inputTokens / 1_000_000) * pricing.inputPerMillion;
+  const outputCost = (outputTokens / 1_000_000) * pricing.outputPerMillion;
+  return inputCost + outputCost;
+}
+
+/**
+ * True when the provider's model has a PRICING entry whose input and output
+ * prices are both zero. Models without an entry are reported as not local.
+ */
+export function isLocal(provider: ProviderConfig): boolean {
+  const pricing = PRICING[provider.model];
+  if (pricing === undefined) return false;
+  return pricing.inputPerMillion === 0 && pricing.outputPerMillion === 0;
+}
diff --git a/benchmarks/v2/metrics/stats.ts b/benchmarks/v2/metrics/stats.ts
new file mode 100644
index 0000000..ad2f0db
--- /dev/null
+++ b/benchmarks/v2/metrics/stats.ts
@@ -0,0 +1,206 @@
+import type { Cell, RunRecord } from "../runner/types.ts";
+import { estimateCostUsd } from "./cost.ts";
+import { categorizeRun, isSuccess, summarizeCategories, type FailureCategory } from "./taxonomy.ts";
+
+/** Summary statistics over the finite samples of one metric. */
+export interface NumericAggregate {
+  /** Number of finite samples the statistics were computed from. */
+  count: number;
+  mean: number;
+  median: number;
+  p95: number;
+  /** Sample standard deviation (n − 1 denominator); 0 when there is a single sample. */
+  stdev: number;
+  min: number;
+  max: number;
+  /** Bootstrap 95% CI on the mean. Omitted when there are fewer than 5 samples. */
+  ci95?: [number, number];
+}
+
+/** Aggregated results for all runs that share one cellId. */
+export interface CellAggregate {
+  cellId: string;
+  /** Cell configuration, taken from the first run in the group. */
+  cell: Cell;
+  /** Number of runs in the group, including runs that carry no metrics. */
+  runs: number;
+  /** Fraction of runs for which isSuccess holds. */
+  passRate: number;
+  failureCategories: Record<FailureCategory, number>;
+  totalTokens: NumericAggregate;
+  inputTokens: NumericAggregate;
+  outputTokens: NumericAggregate;
+  maxContextTokens: NumericAggregate;
+  turns: NumericAggregate;
+  toolCalls: NumericAggregate;
+  specComplianceRate: NumericAggregate;
+  llmTimeMs: NumericAggregate;
+  totalTimeMs: NumericAggregate;
+  timeToFirstToolCallMs: NumericAggregate;
+  /** Bytes sent plus bytes received. */
+  transportBytes: NumericAggregate;
+  costUsd: NumericAggregate;
+  /** Cost per successful run. Infinity when passRate is 0. */
+  costPerSuccess: number;
+  /** Tokens per successful run. Infinity when passRate is 0. */
+  tokensPerSuccess: number;
+}
+
+/**
+ * Collapse raw run records into one aggregate per cell.
+ *
+ * Records are first de-duplicated by runId (records without a runId are
+ * ignored) and then grouped by cellId (records without a cellId are ignored).
+ */
+export function aggregateCells(runs: RunRecord[]): CellAggregate[] {
+  // A resumed sweep can leave two records for the same runId: the original
+  // failure and the later re-run. The first record seen is kept unless it is
+  // unusable (errored or without metrics) and a later one is clean.
+  const isClean = (r: RunRecord): boolean => !!r.metrics && !r.error;
+  const byRun = new Map<string, RunRecord>();
+  for (const record of runs) {
+    if (!record.runId) continue;
+    const prior = byRun.get(record.runId);
+    if (!prior || (!isClean(prior) && isClean(record))) {
+      byRun.set(record.runId, record);
+    }
+  }
+
+  const byCell = new Map<string, RunRecord[]>();
+  for (const record of byRun.values()) {
+    if (!record.cellId) continue;
+    const group = byCell.get(record.cellId);
+    if (group) group.push(record);
+    else byCell.set(record.cellId, [record]);
+  }
+
+  return Array.from(byCell, ([cellId, cellRuns]): CellAggregate => {
+    const { cell } = cellRuns[0];
+    // Only runs that actually carry metrics contribute to the numeric stats.
+    const metrics = cellRuns.map((r) => r.metrics).filter((m): m is NonNullable<typeof m> => !!m);
+    const runCount = cellRuns.length;
+    const passCount = cellRuns.filter(isSuccess).length;
+
+    const agg = (pick: (m: (typeof metrics)[number]) => number): NumericAggregate =>
+      numericAggregate(metrics.map(pick));
+    // Mean scaled by the number of runs, spread over the successful ones.
+    const perSuccess = (a: NumericAggregate): number =>
+      passCount > 0 ? (a.mean * runCount) / passCount : Number.POSITIVE_INFINITY;
+
+    const totalTokens = agg((m) => m.totalTokens);
+    const costUsd = agg((m) => estimateCostUsd(cell.provider, m.inputTokens, m.outputTokens));
+
+    return {
+      cellId,
+      cell,
+      runs: runCount,
+      passRate: runCount > 0 ? passCount / runCount : 0,
+      failureCategories: summarizeCategories(cellRuns),
+      totalTokens,
+      inputTokens: agg((m) => m.inputTokens),
+      outputTokens: agg((m) => m.outputTokens),
+      maxContextTokens: agg((m) => m.maxContextTokens),
+      turns: agg((m) => m.turns),
+      toolCalls: agg((m) => m.toolCalls),
+      specComplianceRate: agg((m) => m.specComplianceRate),
+      llmTimeMs: agg((m) => m.llmTimeMs),
+      totalTimeMs: agg((m) => m.totalTimeMs),
+      // A missing value becomes NaN, which numericAggregate filters out.
+      timeToFirstToolCallMs: agg((m) => m.timeToFirstToolCallMs ?? Number.NaN),
+      transportBytes: agg((m) => m.transportBytesSent + m.transportBytesReceived),
+      costUsd,
+      costPerSuccess: perSuccess(costUsd),
+      tokensPerSuccess: perSuccess(totalTokens),
+    };
+  });
+}
+
+/**
+ * Summarise a list of numbers. Non-finite entries (NaN, ±Infinity) are dropped
+ * first; with nothing left, every statistic is 0 and `count` is 0.
+ * The bootstrap CI is attached only when at least 5 finite samples remain.
+ */
+export function numericAggregate(raw: number[]): NumericAggregate {
+  const values = raw.filter((v) => Number.isFinite(v));
+  const count = values.length;
+  if (count === 0) {
+    return { count: 0, mean: 0, median: 0, p95: 0, stdev: 0, min: 0, max: 0 };
+  }
+
+  let sum = 0;
+  for (const v of values) sum += v;
+  const mean = sum / count;
+
+  let stdev = 0;
+  if (count > 1) {
+    let squares = 0;
+    for (const v of values) squares += (v - mean) ** 2;
+    stdev = Math.sqrt(squares / (count - 1));
+  }
+
+  const ascending = values.slice().sort((a, b) => a - b);
+  const result: NumericAggregate = {
+    count,
+    mean,
+    median: percentile(ascending, 0.5),
+    p95: percentile(ascending, 0.95),
+    stdev,
+    min: ascending[0],
+    max: ascending[count - 1],
+  };
+  if (count >= 5) result.ci95 = bootstrapCiMean(values);
+  return result;
+}
+
+/**
+ * Linearly interpolated quantile `q` (0..1) of an ascending-sorted array.
+ * Returns 0 for an empty array and the sole element for a one-element array.
+ */
+function percentile(sorted: number[], q: number): number {
+  const n = sorted.length;
+  if (n === 0) return 0;
+  if (n === 1) return sorted[0];
+  const pos = q * (n - 1);
+  const below = Math.floor(pos);
+  const above = Math.ceil(pos);
+  return below === above
+    ? sorted[below]
+    : sorted[below] + (sorted[above] - sorted[below]) * (pos - below);
+}
+
+/**
+ * Nonparametric bootstrap 95% CI for the mean. Resamples with replacement
+ * `resamples` times (2000 by default). Good enough for a dev-facing
+ * dashboard; swap in BCa later if we need bias correction.
+ *
+ * @param values    Samples to resample from; expected to be non-empty.
+ * @param resamples Number of bootstrap resamples; must be at least 1.
+ * @param random    Source of uniform numbers in [0, 1). Defaults to
+ *                  Math.random; pass a seeded generator to make the interval
+ *                  reproducible between runs.
+ * @returns The 2.5th and 97.5th percentile of the resampled means.
+ */
+function bootstrapCiMean(
+  values: number[],
+  resamples = 2000,
+  random: () => number = Math.random,
+): [number, number] {
+  const n = values.length;
+  const means: number[] = new Array(resamples);
+  for (let b = 0; b < resamples; b++) {
+    let sum = 0;
+    // `| 0` truncates the scaled random number to an index in [0, n).
+    for (let i = 0; i < n; i++) sum += values[(random() * n) | 0];
+    means[b] = sum / n;
+  }
+  means.sort((a, b) => a - b);
+  return [means[Math.floor(resamples * 0.025)], means[Math.floor(resamples * 0.975)]];
+}
+
+/**
+ * Welch's t-test on two independent samples.
+ *
+ * Returns the t statistic and an approximate two-sided p-value. The p-value
+ * always comes from the standard normal distribution rather than Student's t,
+ * so it is too small for small samples. Good enough to colour dashboard
+ * rows; not good enough to publish.
+ *
+ * @returns null when either sample has fewer than two values.
+ */
+export function welchTTest(a: number[], b: number[]): { t: number; pTwoSided: number } | null {
+  const na = a.length;
+  const nb = b.length;
+  if (na < 2 || nb < 2) return null;
+  const meanA = a.reduce((x, y) => x + y, 0) / na;
+  const meanB = b.reduce((x, y) => x + y, 0) / nb;
+  const varA = a.reduce((acc, v) => acc + (v - meanA) ** 2, 0) / (na - 1);
+  const varB = b.reduce((acc, v) => acc + (v - meanB) ** 2, 0) / (nb - 1);
+  const se = Math.sqrt(varA / na + varB / nb);
+  if (se === 0) {
+    // Both samples are constant. With equal means there is no evidence of a
+    // difference; with different means the samples are perfectly separated,
+    // which is the limiting case t → ±∞, p → 0 (not "no difference").
+    if (meanA === meanB) return { t: 0, pTwoSided: 1 };
+    return { t: meanA > meanB ? Number.POSITIVE_INFINITY : Number.NEGATIVE_INFINITY, pTwoSided: 0 };
+  }
+  const t = (meanA - meanB) / se;
+  // Normal approximation to the two-sided p-value.
+  const pTwoSided = 2 * (1 - phi(Math.abs(t)));
+  return { t, pTwoSided };
+}
+
+/**
+ * Standard normal CDF Φ(x), computed as ½·(1 + sign(x)·erf(|x|/√2)) with erf
+ * approximated by the Abramowitz & Stegun 7.1.26 polynomial.
+ */
+function phi(x: number): number {
+  // Coefficients a5..a1, ordered for Horner evaluation.
+  const coeffs = [1.061405429, -1.453152027, 1.421413741, -0.284496736, 0.254829592];
+  const p = 0.3275911;
+  const z = Math.abs(x) / Math.sqrt(2);
+  const t = 1 / (1 + p * z);
+  let poly = 0;
+  for (const c of coeffs) poly = poly * t + c;
+  const erf = 1 - poly * t * Math.exp(-z * z);
+  return x < 0 ? 0.5 * (1 + -1 * erf) : 0.5 * (1 + 1 * erf);
+}
+
+/**
+ * Parse JSONL lines into run records.
+ *
+ * Blank lines are skipped, and so is any JSON value that is not an object
+ * with a `cellId` key (for example the sweep header line).
+ *
+ * @throws SyntaxError naming the 1-based line number when a non-blank line is
+ *   not valid JSON, so a truncated or corrupted runs.jsonl can be located.
+ */
+export function loadRuns(lines: string[]): RunRecord[] {
+  const out: RunRecord[] = [];
+  for (let i = 0; i < lines.length; i++) {
+    const trimmed = lines[i].trim();
+    if (!trimmed) continue;
+    let obj: unknown;
+    try {
+      obj = JSON.parse(trimmed);
+    } catch (err) {
+      const reason = err instanceof Error ? err.message : String(err);
+      throw new SyntaxError(`invalid JSON on line ${i + 1}: ${reason}`);
+    }
+    if (obj && typeof obj === "object" && "cellId" in obj) out.push(obj as RunRecord);
+  }
+  return out;
+}
+
+/** Count how many records fall into each category returned by categorizeRun. */
+export function categoryCounts(records: RunRecord[]) {
+  const tally: Record<string, number> = {};
+  records.forEach((record) => {
+    categorizeRun(record).forEach((category) => {
+      tally[category] = (tally[category] ?? 0) + 1;
+    });
+  });
+  return tally;
+}
diff --git a/benchmarks/v2/metrics/taxonomy.ts b/benchmarks/v2/metrics/taxonomy.ts
new file mode 100644
index 0000000..9c93dd8
--- /dev/null
+++ b/benchmarks/v2/metrics/taxonomy.ts
@@ -0,0 +1,73 @@
+import type { CellMetrics, RunRecord } from "../runner/types.ts";
+
+/**
+ * Failure taxonomy — why did this run fall short? A run may land in multiple
+ * buckets (max_turns *and* verification failure) so we return a set.
+ *
+ * Categories:
+ * - `ok`                 — passed verification cleanly (no other category applies)
+ * - `no_verifier`        — scenario has no verifier; we can't score it
+ * - `verify_fail`        — verifier returned passed=false
+ * - `max_turns`          — ran out of budget before finishing
+ * - `tool_unknown`       — agent called a tool that didn't exist on the tree at that moment
+ * - `tool_invoke_error`  — affordance was valid but invoke() threw
+ * - `tool_param_error`   — affordance was valid but args were malformed
+ * - `cell_exception`     — runner itself threw (network, server crash, …)
+ */
+export type FailureCategory =
+  | "ok"
+  | "no_verifier"
+  | "verify_fail"
+  | "max_turns"
+  | "tool_unknown"
+  | "tool_invoke_error"
+  | "tool_param_error"
+  | "cell_exception";
+
+/**
+ * List every category that applies to a run, in a fixed order.
+ *
+ * A record without metrics yields `["cell_exception"]` when it has an error
+ * and an empty list otherwise. `"ok"` appears only on its own, i.e. when
+ * verification passed and no other category was recorded.
+ */
+export function categorizeRun(record: RunRecord): FailureCategory[] {
+  const found: FailureCategory[] = [];
+  if (record.error) found.push("cell_exception");
+
+  const metrics = record.metrics;
+  if (!metrics) return found;
+
+  if (metrics.unknownToolCalls > 0) found.push("tool_unknown");
+  if (metrics.invokeErrorCalls > 0) found.push("tool_invoke_error");
+  if (metrics.paramErrorCalls > 0) found.push("tool_param_error");
+  if (metrics.finishReason === "max_turns") found.push("max_turns");
+
+  const verification = metrics.verification;
+  if (!verification) found.push("no_verifier");
+  else if (!verification.passed) found.push("verify_fail");
+
+  // Nothing recorded so far implies verification exists and passed.
+  if (found.length === 0) found.push("ok");
+  return found;
+}
+
+/** A run succeeds only when it has no error and its verification result is exactly `passed === true`. */
+export function isSuccess(record: RunRecord): boolean {
+  return !record.error && record.metrics?.verification?.passed === true;
+}
+
+/**
+ * Count, per category, how many records were placed in it by categorizeRun.
+ * Every category is present in the result, starting from 0.
+ */
+export function summarizeCategories(records: RunRecord[]): Record<FailureCategory, number> {
+  const tally: Record<FailureCategory, number> = {
+    ok: 0,
+    no_verifier: 0,
+    verify_fail: 0,
+    max_turns: 0,
+    tool_unknown: 0,
+    tool_invoke_error: 0,
+    tool_param_error: 0,
+    cell_exception: 0,
+  };
+  records.forEach((record) => {
+    categorizeRun(record).forEach((category) => {
+      tally[category] += 1;
+    });
+  });
+  return tally;
+}
+
+/**
+ * Intentionally empty: its only purpose is to reference the CellMetrics type
+ * in a signature so that the import counts as used.
+ * NOTE(review): the import is `import type`, which is erased at compile time,
+ * so this anchor may be unnecessary — confirm no caller uses it before removing.
+ */
+export function _cellMetricsForTypeCheck(_m: CellMetrics) {
+  // Exists so CellMetrics import isn't dropped — used by categorizeRun above via record.metrics.
+}
diff --git a/benchmarks/v2/package.json b/benchmarks/v2/package.json
new file mode 100644
index 0000000..b387bc2
--- /dev/null
+++ b/benchmarks/v2/package.json
@@ -0,0 +1,20 @@
+{
+  "name": "slop-benchmarks-v2",
+  "private": true,
+  "type": "module",
+  "scripts": {
+    "smoke": "bun run smoke/provider-test.ts",
+    "sweep": "bun run run.ts",
+    "aggregate": "bun run metrics/aggregate.ts",
+    "dash": "bun run dashboard/serve.ts"
+  },
+  "dependencies": {
+    "@slop-ai/server": "workspace:*",
+    "@slop-ai/consumer": "workspace:*",
+    "@slop-ai/core": "workspace:*",
+    "@modelcontextprotocol/sdk": "^1.29.0"
+  },
+  "devDependencies": {
+    "bun-types": "^1.3.11"
+  }
+}
diff --git a/benchmarks/v2/providers/openai-compat.ts b/benchmarks/v2/providers/openai-compat.ts
new file mode 100644
index 0000000..6a681cc
--- /dev/null
+++ b/benchmarks/v2/providers/openai-compat.ts
@@ -0,0 +1,184 @@
+import type {
+  ChatMessage,
+  FinishReason,
+  GenerateRequest,
+  GenerateResponse,
+  LlmProvider,
+  ToolCall,
+} from "./types.ts";
+
+/** Construction options for OpenAICompatProvider. */
+export interface OpenAICompatOptions {
+  /** API root that `/chat/completions` is appended to; one trailing slash is stripped. */
+  baseUrl: string;
+  /** Model name sent in the request body. */
+  model: string;
+  /** Bearer token; defaults to "dummy-key". */
+  apiKey?: string;
+  /** Provider id; defaults to `openai-compat:<model>`. */
+  id?: string;
+  /** Request timeout in milliseconds; defaults to 180000. */
+  requestTimeoutMs?: number;
+}
+
+/** Wire shape of one tool call inside an OpenAI-style chat message. */
+interface OpenAIToolCall {
+  id: string;
+  type: "function";
+  /** `arguments` is a JSON-encoded string, not an object. */
+  function: { name: string; arguments: string };
+}
+
+/** Wire shape of one chat message in an OpenAI-style request or response. */
+interface OpenAIMessage {
+  role: "system" | "user" | "assistant" | "tool";
+  /** May be null on the wire; fromOpenAIMessage converts null to "". */
+  content: string | null;
+  /** Set on assistant messages that request tool calls. */
+  tool_calls?: OpenAIToolCall[];
+  /** Set on tool messages: the id of the call being answered. */
+  tool_call_id?: string;
+  name?: string;
+}
+
+/** The subset of the chat-completions response body that this provider reads. */
+interface OpenAIChatResponse {
+  /** Only the first choice is used. */
+  choices: Array<{
+    index: number;
+    message: OpenAIMessage;
+    finish_reason: string | null;
+  }>;
+  /** Token accounting; each missing counter is treated as 0. */
+  usage?: {
+    prompt_tokens?: number;
+    completion_tokens?: number;
+    total_tokens?: number;
+  };
+}
+
+/**
+ * LlmProvider backed by any server that accepts OpenAI-style
+ * `POST <baseUrl>/chat/completions` requests.
+ */
+export class OpenAICompatProvider implements LlmProvider {
+  readonly id: string;
+  readonly model: string;
+  private readonly baseUrl: string;
+  private readonly apiKey: string;
+  private readonly timeoutMs: number;
+
+  constructor(opts: OpenAICompatOptions) {
+    this.baseUrl = opts.baseUrl.replace(/\/$/, "");
+    this.model = opts.model;
+    this.apiKey = opts.apiKey ?? "dummy-key";
+    this.id = opts.id ?? `openai-compat:${opts.model}`;
+    this.timeoutMs = opts.requestTimeoutMs ?? 180_000;
+  }
+
+  /**
+   * Send one chat-completion request and normalise the first choice.
+   *
+   * The system prompt is prepended as a system message. Temperature defaults
+   * to 0. `rawLatencyMs` covers the time until the response headers arrive.
+   *
+   * @throws Error when the server replies with a non-2xx status or the body
+   *   contains no choices. A request that exceeds the timeout (including while
+   *   the body is still being read) rejects with the fetch abort error.
+   */
+  async generate(req: GenerateRequest): Promise<GenerateResponse> {
+    const messages: OpenAIMessage[] = [
+      { role: "system", content: req.systemPrompt },
+      ...req.messages.map(toOpenAIMessage),
+    ];
+
+    const hasTools = req.tools.length > 0;
+    const body = {
+      model: this.model,
+      messages,
+      // Omit `tools` entirely when there are none (undefined is dropped by
+      // JSON.stringify): some OpenAI-compatible servers reject an empty array.
+      tools: hasTools
+        ? req.tools.map((t) => ({
+            type: "function" as const,
+            function: {
+              name: t.name,
+              description: t.description,
+              parameters: t.parameters,
+            },
+          }))
+        : undefined,
+      tool_choice: hasTools ? "auto" : undefined,
+      temperature: req.temperature ?? 0,
+      max_tokens: req.maxTokens,
+    };
+
+    const t0 = performance.now();
+    const controller = new AbortController();
+    const timer = setTimeout(() => controller.abort(), this.timeoutMs);
+    try {
+      const res = await fetch(`${this.baseUrl}/chat/completions`, {
+        method: "POST",
+        headers: {
+          "Content-Type": "application/json",
+          Authorization: `Bearer ${this.apiKey}`,
+        },
+        body: JSON.stringify(body),
+        signal: controller.signal,
+      });
+      const rawLatencyMs = performance.now() - t0;
+
+      if (!res.ok) {
+        const text = await res.text().catch(() => "");
+        throw new Error(`OpenAI-compat ${this.baseUrl} ${res.status}: ${text.slice(0, 500)}`);
+      }
+
+      const json = (await res.json()) as OpenAIChatResponse;
+      // `choices` can be absent when a server reports an error with status 200.
+      const choice = json.choices?.[0];
+      if (!choice) throw new Error("OpenAI-compat response has no choices");
+      const message = fromOpenAIMessage(choice.message);
+      const finishReason = normaliseFinishReason(choice.finish_reason);
+      const usage = {
+        inputTokens: json.usage?.prompt_tokens ?? 0,
+        outputTokens: json.usage?.completion_tokens ?? 0,
+        totalTokens:
+          json.usage?.total_tokens ??
+          (json.usage?.prompt_tokens ?? 0) + (json.usage?.completion_tokens ?? 0),
+      };
+
+      return { message, usage, finishReason, rawLatencyMs };
+    } finally {
+      // Cleared only after the body has been consumed, so the timeout also
+      // bounds a response whose body stalls after the headers were sent.
+      clearTimeout(timer);
+    }
+  }
+}
+
+/**
+ * Convert an internal chat message to the OpenAI wire shape.
+ * Tool messages carry `tool_call_id` and `name`; assistant messages with at
+ * least one tool call carry `tool_calls` with JSON-encoded arguments;
+ * everything else is reduced to role and content.
+ */
+function toOpenAIMessage(m: ChatMessage): OpenAIMessage {
+  const { role, content } = m;
+  if (role === "tool") {
+    return { role: "tool", content, tool_call_id: m.toolCallId, name: m.name };
+  }
+
+  const calls = role === "assistant" ? m.toolCalls ?? [] : [];
+  if (calls.length === 0) return { role, content };
+
+  return {
+    role: "assistant",
+    content,
+    tool_calls: calls.map((call) => ({
+      id: call.id,
+      type: "function",
+      function: { name: call.name, arguments: JSON.stringify(call.arguments) },
+    })),
+  };
+}
+
+/**
+ * Convert an OpenAI wire message to the internal shape: a missing role
+ * becomes "assistant", null content becomes "", and `toolCalls` is set only
+ * when the message carries at least one tool call (arguments parsed from JSON).
+ */
+function fromOpenAIMessage(m: OpenAIMessage): ChatMessage {
+  const calls: ToolCall[] = (m.tool_calls ?? []).map((call) => ({
+    id: call.id,
+    name: call.function.name,
+    arguments: parseArgs(call.function.arguments),
+  }));
+  const role = (m.role as ChatMessage["role"]) ?? "assistant";
+  const content = m.content ?? "";
+  return { role, content, toolCalls: calls.length > 0 ? calls : undefined };
+}
+
+/**
+ * Parse the JSON-encoded `arguments` string of a tool call into a key/value
+ * record. This never throws:
+ * - an empty string yields `{}`;
+ * - a JSON object is returned as-is;
+ * - any other JSON value (array, number, string, boolean, null) is wrapped as
+ *   `{ value }`, so callers always receive a plain record;
+ * - text that is not valid JSON is preserved as `{ _raw }`.
+ */
+function parseArgs(raw: string): Record<string, unknown> {
+  if (!raw) return {};
+  try {
+    const parsed: unknown = JSON.parse(raw);
+    // Arrays also report typeof "object", so they are excluded explicitly.
+    const isRecord = typeof parsed === "object" && parsed !== null && !Array.isArray(parsed);
+    return isRecord ? (parsed as Record<string, unknown>) : { value: parsed };
+  } catch {
+    return { _raw: raw };
+  }
+}
+
+/**
+ * Map a provider-reported finish reason onto the internal FinishReason.
+ * Tool-call and length reasons are recognised under both of their spellings;
+ * "stop", "end_turn", null and anything unrecognised all become "stop".
+ */
+function normaliseFinishReason(raw: string | null): FinishReason {
+  if (raw === "tool_calls" || raw === "function_call") return "tool_calls";
+  if (raw === "length" || raw === "max_tokens") return "length";
+  return "stop";
+}
diff --git a/benchmarks/v2/providers/types.ts b/benchmarks/v2/providers/types.ts
new file mode 100644
index 0000000..87f7506
--- /dev/null
+++ b/benchmarks/v2/providers/types.ts
@@ -0,0 +1,50 @@
+/** The set of roles a `ChatMessage` may carry. */
+export type ChatRole = "system" | "user" | "assistant" | "tool";
+
+/** A single tool invocation attached to a chat message. */
+export interface ToolCall {
+  /** Identifier of this call; a `tool` message refers back to it via `toolCallId`. */
+  id: string;
+  /** Name of the tool being invoked. */
+  name: string;
+  /** Arguments as a key/value object (already decoded, not a JSON string). */
+  arguments: Record<string, unknown>;
+}
+
+/** Provider-independent representation of one message in a conversation. */
+export interface ChatMessage {
+  /** Who the message is attributed to. */
+  role: ChatRole;
+  /** Text content of the message. */
+  content: string;
+  /** Tool invocations carried by the message; the OpenAI-compat adapter forwards them only for `assistant` messages. */
+  toolCalls?: ToolCall[];
+  /** For `tool` messages: the id of the `ToolCall` this message answers. */
+  toolCallId?: string;
+  /** For `tool` messages: forwarded as the wire-format `name` field (presumably the tool's name — confirm against callers). */
+  name?: string;
+}
+
+/** Declaration of a tool that is offered to the model in a `GenerateRequest`. */
+export interface ToolDef {
+  /** Name of the tool. */
+  name: string;
+  /** Human-readable description of the tool. */
+  description: string;
+  /** Parameter description object. NOTE(review): presumably a JSON Schema — confirm against the providers. */
+  parameters: Record<string, unknown>;
+}
+
+/** Token accounting reported for a single `generate` call. */
+export interface LlmUsage {
+  /** Tokens counted on the input (prompt) side. */
+  inputTokens: number;
+  /** Tokens counted on the output (completion) side. */
+  outputTokens: number;
+  /** Overall token count. NOTE(review): presumably `inputTokens + outputTokens` — confirm per provider. */
+  totalTokens: number;
+}
+
+/**
+ * Normalised reason a generation ended.
+ *
+ * NOTE(review): the OpenAI-compat provider's normaliser only ever yields
+ * `"stop"`, `"tool_calls"` or `"length"`; `"error"` is presumably set elsewhere
+ * on failure — confirm.
+ */
+export type FinishReason = "stop" | "tool_calls" | "length" | "error";
+
+/** Input to `LlmProvider.generate`. */
+export interface GenerateRequest {
+  /** System prompt, supplied separately from `messages`. */
+  systemPrompt: string;
+  /** Conversation messages to send. */
+  messages: ChatMessage[];
+  /** Tools offered to the model for this call. */
+  tools: ToolDef[];
+  /** Optional sampling temperature; behaviour when omitted is up to the provider (not visible here). */
+  temperature?: number;
+  /** Optional token limit. NOTE(review): presumably caps generated tokens — confirm in the provider. */
+  maxTokens?: number;
+}
+
+/** Result of `LlmProvider.generate`. */
+export interface GenerateResponse {
+  /** The message produced by the model, including any tool calls. */
+  message: ChatMessage;
+  /** Token counts for this call. */
+  usage: LlmUsage;
+  /** Normalised reason the generation ended. */
+  finishReason: FinishReason;
+  /** Latency of the call. NOTE(review): presumably wall-clock milliseconds around the request — confirm in the provider. */
+  rawLatencyMs: number;
+}
+
+/** Contract every LLM backend implements so that callers can treat providers interchangeably. */
+export interface LlmProvider {
+  /** Identifier of the provider. */
+  readonly id: string;
+  /** Name of the model this provider instance talks to. */
+  readonly model: string;
+  /**
+   * Run one generation step for the given request.
+   *
+   * @param req - System prompt, messages, tools and optional sampling settings.
+   * @returns A promise resolving to the model's message, usage, finish reason and latency.
+   */
+  generate(req: GenerateRequest): Promise<GenerateResponse>;
+}
diff --git a/benchmarks/v2/results/ablation-prompts-encodings/runs.jsonl b/benchmarks/v2/results/ablation-prompts-encodings/runs.jsonl
new file mode 100644
index 0000000..28c2e38
--- /dev/null
+++ b/benchmarks/v2/results/ablation-prompts-encodings/runs.jsonl
@@ -0,0 +1,13 @@
+{"type":"sweep","config":{"id":"ablation-prompts-encodings","providers":[{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"}],"promptVariants":["minimal","spec","spec-terse"],"encodingVariants":["indented-text","json-compact","markdown-headings"],"optimizationVariants":["off"],"protocols":["slop","mcp"],"mcpVariants":["flat","flat+prompt"],"apps":["todo"],"dataScales":["s"],"scenarioFilter":["mark-all-done"],"seeds":[42],"iterations":1,"maxConcurrency":1,"maxTurns":30,"temperature":0},"startedAt":"2026-04-15T13:14:03.671Z"}
+{"sweepId":"ablation-prompts-encodings","cellId":"6bff88133d655644","runId":"ablation-prompts-encodings:6bff88133d655644:0","configHash":"6bff88133d655644","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"minimal","encoding":"indented-text","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:14:03.672Z","durationMs":51585.324666,"metrics":{"turns":2,"toolCalls":6,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":2757,"outputTokens":442,"totalTokens":3199,"maxContextTokens":1456,"timeToFirstToolCallMs":33147.140999999996,"setupTimeMs":2.0802090000000106,"llmTimeMs":51566.906832999994,"totalTimeMs":51584.657750000006,"transportBytesSent":486,"transportBytesReceived":402,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":1301,"outputTokens":262,"latencyMs":33147.126874999994,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":1456,"outputTokens":180,"latencyMs":18419.779958,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}}
+{"sweepId":"ablation-prompts-encodings","cellId":"0c2aa43697715c54","runId":"ablation-prompts-encodings:0c2aa43697715c54:0","configHash":"0c2aa43697715c54","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"minimal","encoding":"json-compact","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:14:55.258Z","durationMs":55335.918750000004,"metrics":{"turns":2,"toolCalls":6,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":5597,"outputTokens":516,"totalTokens":6113,"maxContextTokens":2876,"timeToFirstToolCallMs":37305.58974999999,"setupTimeMs":2.161500000001979,"llmTimeMs":55322.11345799999,"totalTimeMs":55335.858958,"transportBytesSent":486,"transportBytesReceived":402,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":2721,"outputTokens":341,"latencyMs":37305.579207999996,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":2876,"outputTokens":175,"latencyMs":18016.534249999997,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}}
+{"sweepId":"ablation-prompts-encodings","cellId":"7de01e731db22914","runId":"ablation-prompts-encodings:7de01e731db22914:0","configHash":"7de01e731db22914","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"minimal","encoding":"markdown-headings","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:15:50.594Z","durationMs":137832.292292,"metrics":{"turns":4,"toolCalls":12,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":6,"inputTokens":8458,"outputTokens":1357,"totalTokens":9815,"maxContextTokens":2287,"timeToFirstToolCallMs":35589.190791000015,"setupTimeMs":2.3482090000034077,"llmTimeMs":137818.866334,"totalTimeMs":137832.177208,"transportBytesSent":486,"transportBytesReceived":402,"specComplianceRate":0.5,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":1868,"outputTokens":337,"latencyMs":35589.181291999994,"toolCalls":6,"toolCallKinds":["param_error","param_error","param_error","param_error","param_error","param_error"]},{"index":1,"inputTokens":2132,"outputTokens":622,"latencyMs":61173.34299999999,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":2,"inputTokens":2171,"outputTokens":392,"latencyMs":39279.158041999995,"toolCalls":5,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance"]},{"index":3,"inputTokens":2287,"outputTokens":6,"latencyMs":1777.1840000000084,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}}
+{"sweepId":"ablation-prompts-encodings","cellId":"9dae0af9aee37f10","runId":"ablation-prompts-encodings:9dae0af9aee37f10:0","configHash":"9dae0af9aee37f10","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:18:08.427Z","durationMs":47738.54904099999,"metrics":{"turns":2,"toolCalls":6,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":4595,"outputTokens":448,"totalTokens":5043,"maxContextTokens":2375,"timeToFirstToolCallMs":30023.628457999963,"setupTimeMs":2.592833000002429,"llmTimeMs":47719.666708000004,"totalTimeMs":47738.48087500001,"transportBytesSent":486,"transportBytesReceived":402,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":2220,"outputTokens":275,"latencyMs":30023.612250000006,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":2375,"outputTokens":173,"latencyMs":17696.054458,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}}
+{"sweepId":"ablation-prompts-encodings","cellId":"96e02e4e7de9a3b7","runId":"ablation-prompts-encodings:96e02e4e7de9a3b7:0","configHash":"96e02e4e7de9a3b7","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"json-compact","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:18:56.165Z","durationMs":56172.179000000004,"metrics":{"turns":2,"toolCalls":6,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":7435,"outputTokens":522,"totalTokens":7957,"maxContextTokens":3795,"timeToFirstToolCallMs":38336.83799999999,"setupTimeMs":1.1343749999650754,"llmTimeMs":56164.567332999955,"totalTimeMs":56172.16070800001,"transportBytesSent":486,"transportBytesReceived":402,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":3640,"outputTokens":349,"latencyMs":38336.81191599998,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":3795,"outputTokens":173,"latencyMs":17827.75541699998,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}}
+{"sweepId":"ablation-prompts-encodings","cellId":"e0ef4d76686f08ea","runId":"ablation-prompts-encodings:e0ef4d76686f08ea:0","configHash":"e0ef4d76686f08ea","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"markdown-headings","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:42:39.360Z","durationMs":333329.89158299996,"metrics":{"turns":8,"toolCalls":16,"navigationToolCalls":2,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":8,"inputTokens":32583,"outputTokens":3029,"totalTokens":35612,"maxContextTokens":5844,"timeToFirstToolCallMs":42380.975875000004,"setupTimeMs":1.8915829999999971,"llmTimeMs":333310.26345999993,"totalTimeMs":333329.25654200005,"transportBytesSent":486,"transportBytesReceived":7378,"specComplianceRate":0.42857142857142855,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":2787,"outputTokens":333,"latencyMs":42380.956999999995,"toolCalls":6,"toolCallKinds":["param_error","param_error","param_error","param_error","param_error","param_error"]},{"index":1,"inputTokens":3051,"outputTokens":351,"latencyMs":35039.515416999995,"toolCalls":1,"toolCallKinds":["param_error"]},{"index":2,"inputTokens":3095,"outputTokens":449,"latencyMs":44300.983374999996,"toolCalls":1,"toolCallKinds":["slop_get_state"]},{"index":3,"inputTokens":4370,"outputTokens":218,"latencyMs":25731.645292,"toolCalls":1,"toolCallKinds":["param_error"]},{"index":4,"inputTokens":4414,"outputTokens":963,"latencyMs":104567.07620899999,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":5,"inputTokens":4453,"outputTokens":502,"latencyMs":53954.17258300001,"toolCalls":5,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance"]},{"index":6,"inputTokens":4569,"outputTokens":207,"latencyMs":24500.79916699999,"toolCalls":1,"toolCallKinds":["s
lop_get_state"]},{"index":7,"inputTokens":5844,"outputTokens":6,"latencyMs":2835.114416999975,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}}
+{"sweepId":"ablation-prompts-encodings","cellId":"1254f7f4faea113f","runId":"ablation-prompts-encodings:1254f7f4faea113f:0","configHash":"1254f7f4faea113f","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec-terse","encoding":"indented-text","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:48:12.691Z","durationMs":45359.022791000025,"metrics":{"turns":2,"toolCalls":6,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":3071,"outputTokens":436,"totalTokens":3507,"maxContextTokens":1613,"timeToFirstToolCallMs":27565.84154200001,"setupTimeMs":2.5538329999544658,"llmTimeMs":45341.406999,"totalTimeMs":45358.954792000004,"transportBytesSent":486,"transportBytesReceived":402,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":1458,"outputTokens":262,"latencyMs":27565.833333000017,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":1613,"outputTokens":174,"latencyMs":17775.573665999982,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}}
+{"sweepId":"ablation-prompts-encodings","cellId":"0d9f06f9023716ff","runId":"ablation-prompts-encodings:0d9f06f9023716ff:0","configHash":"0d9f06f9023716ff","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec-terse","encoding":"json-compact","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:48:58.051Z","durationMs":74536.82708299998,"metrics":{"turns":3,"toolCalls":7,"navigationToolCalls":1,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":11137,"outputTokens":655,"totalTokens":11792,"maxContextTokens":5226,"timeToFirstToolCallMs":35114.534208,"setupTimeMs":2.117917000025045,"llmTimeMs":74523.08412600006,"totalTimeMs":74535.90179199999,"transportBytesSent":486,"transportBytesReceived":8573,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":2878,"outputTokens":318,"latencyMs":35114.518209,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":3033,"outputTokens":169,"latencyMs":17403.150667000038,"toolCalls":1,"toolCallKinds":["slop_get_state"]},{"index":2,"inputTokens":5226,"outputTokens":168,"latencyMs":22005.41525000002,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}}
+{"sweepId":"ablation-prompts-encodings","cellId":"2d4ee9f9bcedd11f","runId":"ablation-prompts-encodings:2d4ee9f9bcedd11f:0","configHash":"2d4ee9f9bcedd11f","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec-terse","encoding":"markdown-headings","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:50:12.590Z","durationMs":483956.34754199994,"metrics":{"turns":9,"toolCalls":17,"navigationToolCalls":2,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":9,"inputTokens":29054,"outputTokens":4669,"totalTokens":33723,"maxContextTokens":5121,"timeToFirstToolCallMs":37194.10391599999,"setupTimeMs":2.271749999956228,"llmTimeMs":483941.40420899994,"totalTimeMs":483956.22274999996,"transportBytesSent":486,"transportBytesReceived":7378,"specComplianceRate":0.4,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":2025,"outputTokens":352,"latencyMs":37194.09091700002,"toolCalls":6,"toolCallKinds":["param_error","param_error","param_error","param_error","param_error","param_error"]},{"index":1,"inputTokens":2289,"outputTokens":510,"latencyMs":50299.15070900001,"toolCalls":1,"toolCallKinds":["param_error"]},{"index":2,"inputTokens":2333,"outputTokens":515,"latencyMs":50488.169292000006,"toolCalls":1,"toolCallKinds":["param_error"]},{"index":3,"inputTokens":2372,"outputTokens":1082,"latencyMs":105892.05133399996,"toolCalls":1,"toolCallKinds":["slop_get_state"]},{"index":4,"inputTokens":3647,"outputTokens":854,"latencyMs":89824.584333,"toolCalls":1,"toolCallKinds":["param_error"]},{"index":5,"inputTokens":3691,"outputTokens":950,"latencyMs":101650.54083299998,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":6,"inputTokens":3730,"outputTokens":224,"latencyMs":25251.111874999944,"toolCalls":5,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance"
]},{"index":7,"inputTokens":3846,"outputTokens":176,"latencyMs":20583.621374999988,"toolCalls":1,"toolCallKinds":["slop_get_state"]},{"index":8,"inputTokens":5121,"outputTokens":6,"latencyMs":2758.083541000029,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}}
+{"sweepId":"ablation-prompts-encodings","cellId":"5897a9956995fa0d","runId":"ablation-prompts-encodings:5897a9956995fa0d:0","configHash":"5897a9956995fa0d","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"n/a","encoding":"n/a","optimization":"n/a","protocol":"mcp","mcpVariant":"flat","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:58:16.547Z","durationMs":34546.89762499998,"metrics":{"turns":3,"toolCalls":7,"navigationToolCalls":0,"affordanceToolCalls":7,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":2611,"outputTokens":337,"totalTokens":2948,"maxContextTokens":1151,"timeToFirstToolCallMs":9433.217209000024,"setupTimeMs":86.30054099997506,"llmTimeMs":34428.401083000004,"totalTimeMs":34544.17337500001,"transportBytesSent":324,"transportBytesReceived":1102,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":549,"outputTokens":90,"latencyMs":9433.19070799998,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":1,"inputTokens":911,"outputTokens":241,"latencyMs":23952.858125000028,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":2,"inputTokens":1151,"outputTokens":6,"latencyMs":1042.3522499999963,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}}
+{"sweepId":"ablation-prompts-encodings","cellId":"cc35c74e8c023f04","runId":"ablation-prompts-encodings:cc35c74e8c023f04:0","configHash":"cc35c74e8c023f04","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"n/a","encoding":"n/a","optimization":"n/a","protocol":"mcp","mcpVariant":"flat+prompt","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T13:58:51.094Z","durationMs":1.0311659999424592,"error":"todo: MCP variant \"flat+prompt\" not yet implemented\nError: todo: MCP variant \"flat+prompt\" not yet implemented\n    at startMcpServer (/Users/carlid/dev/slop-slop-slop/benchmarks/v2/apps/todo/index.ts:46:17)\n    at startMcpServer (/Users/carlid/dev/slop-slop-slop/benchmarks/v2/apps/todo/index.ts:44:24)\n    at runMcpCell (/Users/carlid/dev/slop-slop-slop/benchmarks/v2/runner/mcp-cell.ts:41:28)\n    at runMcpCell (/Users/carlid/dev/slop-slop-slop/benchmarks/v2/runner/mcp-cell.ts:12:33)\n    at runSweep (/Users/carlid/dev/slop-slop-slop/benchmarks/v2/runner/sweep.ts:106:31)\n    at processTicksAndRejections (native:7:39)"}
+{"sweepId":"ablation-prompts-encodings","cellId":"cc35c74e8c023f04","runId":"ablation-prompts-encodings:cc35c74e8c023f04:0","configHash":"cc35c74e8c023f04","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"n/a","encoding":"n/a","optimization":"n/a","protocol":"mcp","mcpVariant":"flat+prompt","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T14:00:16.743Z","durationMs":31581.046041999998,"metrics":{"turns":3,"toolCalls":7,"navigationToolCalls":0,"affordanceToolCalls":7,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":3220,"outputTokens":300,"totalTokens":3520,"maxContextTokens":1354,"timeToFirstToolCallMs":6410.151417,"setupTimeMs":60.256874999999994,"llmTimeMs":31495.081041,"totalTimeMs":31580.590166,"transportBytesSent":324,"transportBytesReceived":1102,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":752,"outputTokens":53,"latencyMs":6410.142416,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":1,"inputTokens":1114,"outputTokens":241,"latencyMs":24035.955083,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":2,"inputTokens":1354,"outputTokens":6,"latencyMs":1048.9835420000018,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}}
diff --git a/benchmarks/v2/results/smoke-crm/runs.jsonl b/benchmarks/v2/results/smoke-crm/runs.jsonl
new file mode 100644
index 0000000..aeda59c
--- /dev/null
+++ b/benchmarks/v2/results/smoke-crm/runs.jsonl
@@ -0,0 +1,3 @@
+{"type":"sweep","config":{"id":"smoke-crm","providers":[{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"}],"promptVariants":["spec"],"encodingVariants":["indented-text"],"optimizationVariants":["off"],"protocols":["slop","mcp"],"mcpVariants":["flat"],"apps":["crm"],"dataScales":["s"],"scenarioFilter":["high-value-alert"],"seeds":[42],"iterations":1,"maxConcurrency":1,"maxTurns":40,"temperature":0},"startedAt":"2026-04-15T12:36:01.444Z"}
+{"sweepId":"smoke-crm","cellId":"ca3a9f06545dbc26","runId":"smoke-crm:ca3a9f06545dbc26:0","configHash":"ca3a9f06545dbc26","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"crm","scale":"s","scenario":"high-value-alert","seed":42,"iteration":0},"startedAt":"2026-04-15T12:36:01.445Z","durationMs":65072.22833300001,"metrics":{"turns":2,"toolCalls":6,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":8074,"outputTokens":549,"totalTokens":8623,"maxContextTokens":4217,"timeToFirstToolCallMs":62162.816999999995,"setupTimeMs":2.2873750000000115,"llmTimeMs":65049.417958,"totalTimeMs":65071.560084,"transportBytesSent":828,"transportBytesReceived":402,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":3857,"outputTokens":543,"latencyMs":62162.801583,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":4217,"outputTokens":6,"latencyMs":2886.616374999998,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":6,"passedChecks":6,"failures":[]}}}
+{"sweepId":"smoke-crm","cellId":"e43d5960201b9cf1","runId":"smoke-crm:e43d5960201b9cf1:0","configHash":"e43d5960201b9cf1","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"n/a","encoding":"n/a","optimization":"n/a","protocol":"mcp","mcpVariant":"flat","app":"crm","scale":"s","scenario":"high-value-alert","seed":42,"iteration":0},"startedAt":"2026-04-15T12:37:06.519Z","durationMs":141165.18675,"metrics":{"turns":4,"toolCalls":13,"navigationToolCalls":0,"affordanceToolCalls":7,"unknownToolCalls":0,"invokeErrorCalls":6,"paramErrorCalls":0,"inputTokens":5579,"outputTokens":1420,"totalTokens":6999,"maxContextTokens":2017,"timeToFirstToolCallMs":11841.185708999998,"setupTimeMs":81.94433299999946,"llmTimeMs":141050.49129200002,"totalTimeMs":141164.1875,"transportBytesSent":1272,"transportBytesReceived":1732,"specComplianceRate":0.5384615384615384,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":872,"outputTokens":110,"latencyMs":11841.171208999993,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":1,"inputTokens":1183,"outputTokens":677,"latencyMs":66173.42120800001,"toolCalls":6,"toolCallKinds":["invoke_error","invoke_error","invoke_error","invoke_error","invoke_error","invoke_error"]},{"index":2,"inputTokens":1507,"outputTokens":627,"latencyMs":61510.479500000016,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":3,"inputTokens":2017,"outputTokens":6,"latencyMs":1525.4193749999977,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":6,"passedChecks":6,"failures":[]}}}
diff --git a/benchmarks/v2/results/smoke-file-browser/runs.jsonl b/benchmarks/v2/results/smoke-file-browser/runs.jsonl
new file mode 100644
index 0000000..1c673dc
--- /dev/null
+++ b/benchmarks/v2/results/smoke-file-browser/runs.jsonl
@@ -0,0 +1,3 @@
+{"type":"sweep","config":{"id":"smoke-file-browser","providers":[{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"}],"promptVariants":["spec"],"encodingVariants":["indented-text"],"optimizationVariants":["off"],"protocols":["slop","mcp"],"mcpVariants":["flat"],"apps":["file-browser"],"dataScales":["s"],"scenarioFilter":["delete-empty-dirs"],"seeds":[42],"iterations":1,"maxConcurrency":1,"maxTurns":30,"temperature":0},"startedAt":"2026-04-15T12:42:04.925Z"}
+{"sweepId":"smoke-file-browser","cellId":"bea6d0f1f4d3dde4","runId":"smoke-file-browser:bea6d0f1f4d3dde4:0","configHash":"bea6d0f1f4d3dde4","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"file-browser","scale":"s","scenario":"delete-empty-dirs","seed":42,"iteration":0},"startedAt":"2026-04-15T12:42:04.926Z","durationMs":214138.625,"metrics":{"turns":3,"toolCalls":2,"navigationToolCalls":1,"affordanceToolCalls":1,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":10442,"outputTokens":2000,"totalTokens":12442,"maxContextTokens":4451,"timeToFirstToolCallMs":44267.387458000005,"setupTimeMs":2.2589169999999967,"llmTimeMs":214127.14291700002,"totalTimeMs":214137.983083,"transportBytesSent":52,"transportBytesReceived":3741,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":2982,"outputTokens":405,"latencyMs":44267.379417000004,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":1,"inputTokens":3009,"outputTokens":374,"latencyMs":36754.872083,"toolCalls":1,"toolCallKinds":["slop_get_state"]},{"index":2,"inputTokens":4451,"outputTokens":1221,"latencyMs":133104.891417,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}}
+{"sweepId":"smoke-file-browser","cellId":"566bedc1676bb5d4","runId":"smoke-file-browser:566bedc1676bb5d4:0","configHash":"566bedc1676bb5d4","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"n/a","encoding":"n/a","optimization":"n/a","protocol":"mcp","mcpVariant":"flat","app":"file-browser","scale":"s","scenario":"delete-empty-dirs","seed":42,"iteration":0},"startedAt":"2026-04-15T12:45:39.065Z","durationMs":129256.30066599997,"metrics":{"turns":5,"toolCalls":4,"navigationToolCalls":0,"affordanceToolCalls":4,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":4236,"outputTokens":1317,"totalTokens":5553,"maxContextTokens":1163,"timeToFirstToolCallMs":16461.97116699998,"setupTimeMs":69.93870899998001,"llmTimeMs":129165.43254200005,"totalTimeMs":129255.48000000001,"transportBytesSent":168,"transportBytesReceived":1348,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":548,"outputTokens":164,"latencyMs":16461.958165999997,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":1,"inputTokens":757,"outputTokens":212,"latencyMs":20751.344209000003,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":2,"inputTokens":791,"outputTokens":279,"latencyMs":27032.005584000028,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":3,"inputTokens":977,"outputTokens":331,"latencyMs":32366.21983300004,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":4,"inputTokens":1163,"outputTokens":331,"latencyMs":32553.904749999987,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}}
diff --git a/benchmarks/v2/results/smoke-mcp/runs.jsonl b/benchmarks/v2/results/smoke-mcp/runs.jsonl
new file mode 100644
index 0000000..06a6b41
--- /dev/null
+++ b/benchmarks/v2/results/smoke-mcp/runs.jsonl
@@ -0,0 +1,3 @@
+{"type":"sweep","config":{"id":"smoke-mcp","providers":[{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"}],"promptVariants":["spec"],"encodingVariants":["indented-text"],"optimizationVariants":["off"],"protocols":["slop","mcp"],"mcpVariants":["flat"],"apps":["issue-tracker"],"dataScales":["s"],"scenarioFilter":["explore-and-act"],"seeds":[42],"iterations":1,"maxConcurrency":1,"maxTurns":20,"temperature":0},"startedAt":"2026-04-15T12:12:19.983Z"}
+{"sweepId":"smoke-mcp","cellId":"785d02dabebe52d5","runId":"smoke-mcp:785d02dabebe52d5:0","configHash":"785d02dabebe52d5","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":0},"startedAt":"2026-04-15T12:12:19.984Z","durationMs":60390.156749999995,"metrics":{"turns":2,"toolCalls":2,"navigationToolCalls":0,"affordanceToolCalls":2,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":9603,"outputTokens":432,"totalTokens":10035,"maxContextTokens":4895,"timeToFirstToolCallMs":42213.346625,"setupTimeMs":2.596624999999996,"llmTimeMs":60372.30433299999,"totalTimeMs":60389.474375,"transportBytesSent":375,"transportBytesReceived":350,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":4708,"outputTokens":271,"latencyMs":42213.33775,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":1,"inputTokens":4895,"outputTokens":161,"latencyMs":18158.966582999994,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}}
+{"sweepId":"smoke-mcp","cellId":"eef232b363ac08b8","runId":"smoke-mcp:eef232b363ac08b8:0","configHash":"eef232b363ac08b8","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"n/a","encoding":"n/a","optimization":"n/a","protocol":"mcp","mcpVariant":"flat","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":0},"startedAt":"2026-04-15T12:13:20.375Z","durationMs":33718.141541000005,"metrics":{"turns":4,"toolCalls":4,"navigationToolCalls":0,"affordanceToolCalls":4,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":5074,"outputTokens":315,"totalTokens":5389,"maxContextTokens":1590,"timeToFirstToolCallMs":10728.555625,"setupTimeMs":80.23566600000049,"llmTimeMs":33611.61062399999,"totalTimeMs":33717.270083,"transportBytesSent":368,"transportBytesReceived":1541,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":997,"outputTokens":95,"latencyMs":10728.548500000004,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":1,"inputTokens":1078,"outputTokens":64,"latencyMs":6545.733207999991,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":2,"inputTokens":1409,"outputTokens":150,"latencyMs":15321.989333000005,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":3,"inputTokens":1590,"outputTokens":6,"latencyMs":1015.3395829999936,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}}
diff --git a/benchmarks/v2/results/smoke-todo/runs.jsonl b/benchmarks/v2/results/smoke-todo/runs.jsonl
new file mode 100644
index 0000000..16ddbfd
--- /dev/null
+++ b/benchmarks/v2/results/smoke-todo/runs.jsonl
@@ -0,0 +1,3 @@
+{"type":"sweep","config":{"id":"smoke-todo","providers":[{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"}],"promptVariants":["spec"],"encodingVariants":["indented-text"],"optimizationVariants":["off"],"protocols":["slop","mcp"],"mcpVariants":["flat"],"apps":["todo"],"dataScales":["s"],"scenarioFilter":["mark-all-done"],"seeds":[42],"iterations":1,"maxConcurrency":1,"maxTurns":30,"temperature":0},"startedAt":"2026-04-15T12:17:21.029Z"}
+{"sweepId":"smoke-todo","cellId":"9dae0af9aee37f10","runId":"smoke-todo:9dae0af9aee37f10:0","configHash":"9dae0af9aee37f10","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T12:17:21.030Z","durationMs":48185.751083,"metrics":{"turns":2,"toolCalls":6,"navigationToolCalls":0,"affordanceToolCalls":6,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":4595,"outputTokens":448,"totalTokens":5043,"maxContextTokens":2375,"timeToFirstToolCallMs":30390.164292,"setupTimeMs":2.1136669999999924,"llmTimeMs":48174.050042999996,"totalTimeMs":48185.067708,"transportBytesSent":486,"transportBytesReceived":402,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":2220,"outputTokens":275,"latencyMs":30390.157584,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":1,"inputTokens":2375,"outputTokens":173,"latencyMs":17783.892459,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}}
+{"sweepId":"smoke-todo","cellId":"5897a9956995fa0d","runId":"smoke-todo:5897a9956995fa0d:0","configHash":"5897a9956995fa0d","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"n/a","encoding":"n/a","optimization":"n/a","protocol":"mcp","mcpVariant":"flat","app":"todo","scale":"s","scenario":"mark-all-done","seed":42,"iteration":0},"startedAt":"2026-04-15T12:18:09.216Z","durationMs":34703.892374999996,"metrics":{"turns":3,"toolCalls":7,"navigationToolCalls":0,"affordanceToolCalls":7,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":2611,"outputTokens":337,"totalTokens":2948,"maxContextTokens":1151,"timeToFirstToolCallMs":9442.800042000003,"setupTimeMs":81.44879199999559,"llmTimeMs":34596.018457999984,"totalTimeMs":34702.949875,"transportBytesSent":324,"transportBytesReceived":1102,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":549,"outputTokens":90,"latencyMs":9442.779665999995,"toolCalls":1,"toolCallKinds":["affordance"]},{"index":1,"inputTokens":911,"outputTokens":241,"latencyMs":24044.363457999993,"toolCalls":6,"toolCallKinds":["affordance","affordance","affordance","affordance","affordance","affordance"]},{"index":2,"inputTokens":1151,"outputTokens":6,"latencyMs":1108.8753339999967,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":1,"passedChecks":1,"failures":[]}}}
diff --git a/benchmarks/v2/results/smoke/aggregated.json b/benchmarks/v2/results/smoke/aggregated.json
new file mode 100644
index 0000000..617b061
--- /dev/null
+++ b/benchmarks/v2/results/smoke/aggregated.json
@@ -0,0 +1,288 @@
+{
+  "source": "results/smoke/runs.jsonl",
+  "runs": 6,
+  "cells": [
+    {
+      "cellId": "785d02dabebe52d5",
+      "cell": {
+        "provider": {
+          "kind": "openai-compat",
+          "baseUrl": "http://slopinator-s-1.local:11434/v1",
+          "model": "gemma4:31b"
+        },
+        "prompt": "spec",
+        "encoding": "indented-text",
+        "optimization": "off",
+        "protocol": "slop",
+        "app": "issue-tracker",
+        "scale": "s",
+        "scenario": "explore-and-act",
+        "seed": 42,
+        "iteration": 0
+      },
+      "runs": 3,
+      "passRate": 1,
+      "failureCategories": {
+        "ok": 3,
+        "no_verifier": 0,
+        "verify_fail": 0,
+        "max_turns": 0,
+        "tool_unknown": 0,
+        "tool_invoke_error": 0,
+        "tool_param_error": 0,
+        "cell_exception": 0
+      },
+      "totalTokens": {
+        "count": 3,
+        "mean": 10101.666666666666,
+        "median": 10134,
+        "p95": 10135.8,
+        "stdev": 57.74368652357877,
+        "min": 10035,
+        "max": 10136
+      },
+      "inputTokens": {
+        "count": 3,
+        "mean": 9603,
+        "median": 9603,
+        "p95": 9603,
+        "stdev": 0,
+        "min": 9603,
+        "max": 9603
+      },
+      "outputTokens": {
+        "count": 3,
+        "mean": 498.6666666666667,
+        "median": 531,
+        "p95": 532.8,
+        "stdev": 57.74368652357878,
+        "min": 432,
+        "max": 533
+      },
+      "maxContextTokens": {
+        "count": 3,
+        "mean": 4895,
+        "median": 4895,
+        "p95": 4895,
+        "stdev": 0,
+        "min": 4895,
+        "max": 4895
+      },
+      "turns": {
+        "count": 3,
+        "mean": 2,
+        "median": 2,
+        "p95": 2,
+        "stdev": 0,
+        "min": 2,
+        "max": 2
+      },
+      "toolCalls": {
+        "count": 3,
+        "mean": 2,
+        "median": 2,
+        "p95": 2,
+        "stdev": 0,
+        "min": 2,
+        "max": 2
+      },
+      "specComplianceRate": {
+        "count": 3,
+        "mean": 1,
+        "median": 1,
+        "p95": 1,
+        "stdev": 0,
+        "min": 1,
+        "max": 1
+      },
+      "llmTimeMs": {
+        "count": 3,
+        "mean": 59255.96312499999,
+        "median": 58775.35249999999,
+        "p95": 60187.3716125,
+        "stdev": 944.6343587235024,
+        "min": 58648.274249999995,
+        "max": 60344.262625
+      },
+      "totalTimeMs": {
+        "count": 3,
+        "mean": 59271.172083666665,
+        "median": 58788.625958000004,
+        "p95": 60204.2368589,
+        "stdev": 946.3498279522337,
+        "min": 58663.363333999994,
+        "max": 60361.526959
+      },
+      "timeToFirstToolCallMs": {
+        "count": 3,
+        "mean": 41278.56054200001,
+        "median": 40849.855666999996,
+        "p95": 42089.443066700005,
+        "stdev": 822.7889262352226,
+        "min": 40758.65095900002,
+        "max": 42227.175
+      },
+      "transportBytes": {
+        "count": 3,
+        "mean": 725,
+        "median": 725,
+        "p95": 725,
+        "stdev": 0,
+        "min": 725,
+        "max": 725
+      },
+      "costUsd": {
+        "count": 3,
+        "mean": 0,
+        "median": 0,
+        "p95": 0,
+        "stdev": 0,
+        "min": 0,
+        "max": 0
+      },
+      "costPerSuccess": 0,
+      "tokensPerSuccess": 10101.666666666666
+    },
+    {
+      "cellId": "fbc384d4e3ccd4ae",
+      "cell": {
+        "provider": {
+          "kind": "openai-compat",
+          "baseUrl": "http://slopinator-s-1.local:11434/v1",
+          "model": "gemma4:31b"
+        },
+        "prompt": "spec",
+        "encoding": "indented-text",
+        "optimization": "combined",
+        "protocol": "slop",
+        "app": "issue-tracker",
+        "scale": "s",
+        "scenario": "explore-and-act",
+        "seed": 42,
+        "iteration": 0
+      },
+      "runs": 3,
+      "passRate": 1,
+      "failureCategories": {
+        "ok": 3,
+        "no_verifier": 0,
+        "verify_fail": 0,
+        "max_turns": 0,
+        "tool_unknown": 0,
+        "tool_invoke_error": 0,
+        "tool_param_error": 0,
+        "cell_exception": 0
+      },
+      "totalTokens": {
+        "count": 3,
+        "mean": 8612,
+        "median": 8612,
+        "p95": 8612,
+        "stdev": 0,
+        "min": 8612,
+        "max": 8612
+      },
+      "inputTokens": {
+        "count": 3,
+        "mean": 8115,
+        "median": 8115,
+        "p95": 8115,
+        "stdev": 0,
+        "min": 8115,
+        "max": 8115
+      },
+      "outputTokens": {
+        "count": 3,
+        "mean": 497,
+        "median": 497,
+        "p95": 497,
+        "stdev": 0,
+        "min": 497,
+        "max": 497
+      },
+      "maxContextTokens": {
+        "count": 3,
+        "mean": 4151,
+        "median": 4151,
+        "p95": 4151,
+        "stdev": 0,
+        "min": 4151,
+        "max": 4151
+      },
+      "turns": {
+        "count": 3,
+        "mean": 2,
+        "median": 2,
+        "p95": 2,
+        "stdev": 0,
+        "min": 2,
+        "max": 2
+      },
+      "toolCalls": {
+        "count": 3,
+        "mean": 2,
+        "median": 2,
+        "p95": 2,
+        "stdev": 0,
+        "min": 2,
+        "max": 2
+      },
+      "specComplianceRate": {
+        "count": 3,
+        "mean": 1,
+        "median": 1,
+        "p95": 1,
+        "stdev": 0,
+        "min": 1,
+        "max": 1
+      },
+      "llmTimeMs": {
+        "count": 3,
+        "mean": 55551.42805566667,
+        "median": 54098.86620800002,
+        "p95": 58073.06235890001,
+        "stdev": 2566.380920587207,
+        "min": 54040.778249999974,
+        "max": 58514.63970900001
+      },
+      "totalTimeMs": {
+        "count": 3,
+        "mean": 55564.67487466668,
+        "median": 54111.153208,
+        "p95": 58087.66363330001,
+        "stdev": 2567.7747776995475,
+        "min": 54053.373291000025,
+        "max": 58529.49812500001
+      },
+      "timeToFirstToolCallMs": {
+        "count": 3,
+        "mean": 32601.769805333333,
+        "median": 31149.366374999983,
+        "p95": 35087.656049400015,
+        "stdev": 2531.8199430530817,
+        "min": 31130.699250000005,
+        "max": 35525.243791000015
+      },
+      "transportBytes": {
+        "count": 3,
+        "mean": 725,
+        "median": 725,
+        "p95": 725,
+        "stdev": 0,
+        "min": 725,
+        "max": 725
+      },
+      "costUsd": {
+        "count": 3,
+        "mean": 0,
+        "median": 0,
+        "p95": 0,
+        "stdev": 0,
+        "min": 0,
+        "max": 0
+      },
+      "costPerSuccess": 0,
+      "tokensPerSuccess": 8612
+    }
+  ]
+}
\ No newline at end of file
diff --git a/benchmarks/v2/results/smoke/runs.jsonl b/benchmarks/v2/results/smoke/runs.jsonl
new file mode 100644
index 0000000..6b2ea4b
--- /dev/null
+++ b/benchmarks/v2/results/smoke/runs.jsonl
@@ -0,0 +1,7 @@
+{"type":"sweep","config":{"id":"smoke","providers":[{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"}],"promptVariants":["spec"],"encodingVariants":["indented-text"],"optimizationVariants":["off","combined"],"protocols":["slop"],"apps":["issue-tracker"],"dataScales":["s"],"scenarioFilter":["explore-and-act"],"seeds":[42],"iterations":3,"maxConcurrency":1,"maxTurns":20,"temperature":0},"startedAt":"2026-04-15T11:55:46.162Z"}
+{"sweepId":"smoke","cellId":"785d02dabebe52d5","runId":"smoke:785d02dabebe52d5:0","configHash":"785d02dabebe52d5","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":0},"startedAt":"2026-04-15T11:55:46.162Z","durationMs":60362.468917,"metrics":{"turns":2,"toolCalls":2,"navigationToolCalls":0,"affordanceToolCalls":2,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":9603,"outputTokens":432,"totalTokens":10035,"maxContextTokens":4895,"timeToFirstToolCallMs":42227.175,"setupTimeMs":3.0109580000000022,"llmTimeMs":60344.262625,"totalTimeMs":60361.526959,"transportBytesSent":375,"transportBytesReceived":350,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":4708,"outputTokens":271,"latencyMs":42227.166667000005,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":1,"inputTokens":4895,"outputTokens":161,"latencyMs":18117.095957999998,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}}
+{"sweepId":"smoke","cellId":"785d02dabebe52d5","runId":"smoke:785d02dabebe52d5:1","configHash":"785d02dabebe52d5","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":1},"startedAt":"2026-04-15T11:56:46.525Z","durationMs":58663.499667000004,"metrics":{"turns":2,"toolCalls":2,"navigationToolCalls":0,"affordanceToolCalls":2,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":9603,"outputTokens":531,"totalTokens":10134,"maxContextTokens":4895,"timeToFirstToolCallMs":40849.855666999996,"setupTimeMs":2.284749999998894,"llmTimeMs":58648.274249999995,"totalTimeMs":58663.363333999994,"transportBytesSent":375,"transportBytesReceived":350,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":4708,"outputTokens":372,"latencyMs":40849.844791999996,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":1,"inputTokens":4895,"outputTokens":159,"latencyMs":17798.429458,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}}
+{"sweepId":"smoke","cellId":"785d02dabebe52d5","runId":"smoke:785d02dabebe52d5:2","configHash":"785d02dabebe52d5","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"off","protocol":"slop","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":2},"startedAt":"2026-04-15T11:57:45.189Z","durationMs":58788.669958,"metrics":{"turns":2,"toolCalls":2,"navigationToolCalls":0,"affordanceToolCalls":2,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":9603,"outputTokens":533,"totalTokens":10136,"maxContextTokens":4895,"timeToFirstToolCallMs":40758.65095900002,"setupTimeMs":2.3049170000012964,"llmTimeMs":58775.35249999999,"totalTimeMs":58788.625958000004,"transportBytesSent":375,"transportBytesReceived":350,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":4708,"outputTokens":372,"latencyMs":40758.644125000006,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":1,"inputTokens":4895,"outputTokens":161,"latencyMs":18016.708374999987,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}}
+{"sweepId":"smoke","cellId":"fbc384d4e3ccd4ae","runId":"smoke:fbc384d4e3ccd4ae:0","configHash":"fbc384d4e3ccd4ae","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"combined","protocol":"slop","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":0},"startedAt":"2026-04-15T11:58:43.978Z","durationMs":58529.54658299999,"metrics":{"turns":2,"toolCalls":2,"navigationToolCalls":0,"affordanceToolCalls":2,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":8115,"outputTokens":497,"totalTokens":8612,"maxContextTokens":4151,"timeToFirstToolCallMs":35525.243791000015,"setupTimeMs":1.9764159999904223,"llmTimeMs":58514.63970900001,"totalTimeMs":58529.49812500001,"transportBytesSent":375,"transportBytesReceived":350,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":3964,"outputTokens":288,"latencyMs":35525.239417000004,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":1,"inputTokens":4151,"outputTokens":209,"latencyMs":22989.400292000006,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}}
+{"sweepId":"smoke","cellId":"fbc384d4e3ccd4ae","runId":"smoke:fbc384d4e3ccd4ae:1","configHash":"fbc384d4e3ccd4ae","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"combined","protocol":"slop","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":1},"startedAt":"2026-04-15T11:59:42.508Z","durationMs":54111.23016699997,"metrics":{"turns":2,"toolCalls":2,"navigationToolCalls":0,"affordanceToolCalls":2,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":8115,"outputTokens":497,"totalTokens":8612,"maxContextTokens":4151,"timeToFirstToolCallMs":31149.366374999983,"setupTimeMs":2.1295000000100117,"llmTimeMs":54098.86620800002,"totalTimeMs":54111.153208,"transportBytesSent":375,"transportBytesReceived":350,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":3964,"outputTokens":288,"latencyMs":31149.329875000025,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":1,"inputTokens":4151,"outputTokens":209,"latencyMs":22949.536332999996,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}}
+{"sweepId":"smoke","cellId":"fbc384d4e3ccd4ae","runId":"smoke:fbc384d4e3ccd4ae:2","configHash":"fbc384d4e3ccd4ae","cell":{"provider":{"kind":"openai-compat","baseUrl":"http://slopinator-s-1.local:11434/v1","model":"gemma4:31b"},"prompt":"spec","encoding":"indented-text","optimization":"combined","protocol":"slop","app":"issue-tracker","scale":"s","scenario":"explore-and-act","seed":42,"iteration":2},"startedAt":"2026-04-15T12:00:36.619Z","durationMs":54053.455665999965,"metrics":{"turns":2,"toolCalls":2,"navigationToolCalls":0,"affordanceToolCalls":2,"unknownToolCalls":0,"invokeErrorCalls":0,"paramErrorCalls":0,"inputTokens":8115,"outputTokens":497,"totalTokens":8612,"maxContextTokens":4151,"timeToFirstToolCallMs":31130.699250000005,"setupTimeMs":2.4627910000272095,"llmTimeMs":54040.778249999974,"totalTimeMs":54053.373291000025,"transportBytesSent":375,"transportBytesReceived":350,"specComplianceRate":1,"finishReason":"done","turnBreakdown":[{"index":0,"inputTokens":3964,"outputTokens":288,"latencyMs":31130.65529199998,"toolCalls":2,"toolCallKinds":["affordance","affordance"]},{"index":1,"inputTokens":4151,"outputTokens":209,"latencyMs":22910.122957999993,"toolCalls":0,"toolCallKinds":[]}],"verification":{"passed":true,"totalChecks":3,"passedChecks":3,"failures":[]}}}
diff --git a/benchmarks/v2/run.ts b/benchmarks/v2/run.ts
new file mode 100644
index 0000000..2653509
--- /dev/null
+++ b/benchmarks/v2/run.ts
@@ -0,0 +1,38 @@
+import { parseArgs } from "node:util";
+import { runSweep } from "./runner/sweep.ts";
+import type { SweepConfig } from "./runner/types.ts";
+
+// CLI entry point for the v2 benchmark rig: resolves a sweep definition from
+// ./config/<name>.ts and hands it to runSweep.
+//
+// Flags:
+//   --config <name>  config module to load from ./config (default: "smoke")
+//   --dry-run        forwarded to runSweep as `dryRun`
+//   --fresh          forwarded to runSweep as `fresh`
+//   --id <id>        replaces the sweep id declared by the config module
+const { values } = parseArgs({
+  options: {
+    config: { type: "string", default: "smoke" },
+    "dry-run": { type: "boolean", default: false },
+    fresh: { type: "boolean", default: false },
+    id: { type: "string" },
+  },
+});
+
+const configName = values.config ?? "smoke";
+
+// Report an unknown or broken config module the same way as a missing export
+// below, instead of letting the dynamic import reject with a raw stack trace.
+let mod: Record<string, unknown>;
+try {
+  mod = await import(`./config/${configName}.ts`);
+} catch (err) {
+  const reason = err instanceof Error ? err.message : String(err);
+  console.error(`failed to load config/${configName}.ts: ${reason}`);
+  process.exit(1);
+}
+
+// Export lookup: "smoke-mcp" -> "smokeMcp" -> `smokeMcpSweep`, falling back to
+// the module's default export.
+const camel = configName.replace(/-([a-z])/g, (_, c: string) => c.toUpperCase());
+const sweepName = `${camel}Sweep`;
+const loaded = (mod[sweepName] ?? mod.default) as SweepConfig | undefined;
+if (!loaded) {
+  console.error(`config/${configName}.ts must export \`${sweepName}\` or a default SweepConfig`);
+  process.exit(1);
+}
+
+// Apply --id on a copy so the config module's exported object is left untouched.
+const sweep: SweepConfig = values.id ? { ...loaded, id: values.id } : loaded;
+
+console.log(`[run] sweep=${sweep.id} config=${configName}`);
+console.log(
+  `[run] providers=${sweep.providers.map((p) => `${p.kind}:${p.model}`).join(",")} ` +
+    `prompts=${sweep.promptVariants.join(",")} ` +
+    `encodings=${sweep.encodingVariants.join(",")} ` +
+    `optimizations=${sweep.optimizationVariants.join(",")} ` +
+    `protocols=${sweep.protocols.join(",")} ` +
+    `apps=${sweep.apps.join(",")} ` +
+    `scales=${sweep.dataScales.join(",")} ` +
+    `iterations=${sweep.iterations}`,
+);
+
+await runSweep(sweep, { dryRun: values["dry-run"], fresh: values.fresh });
diff --git a/benchmarks/v2/runner/hash.ts b/benchmarks/v2/runner/hash.ts
new file mode 100644
index 0000000..3d3f49c
--- /dev/null
+++ b/benchmarks/v2/runner/hash.ts
@@ -0,0 +1,66 @@
+import { createHash } from "node:crypto";
+import type { Cell, SweepConfig } from "./types.ts";
+
+/**
+ * Serializes a value into a deterministic JSON-like string for hashing.
+ * Object keys are emitted in sorted order, arrays keep their order, and
+ * primitives go through `JSON.stringify`. Two values that should hash the
+ * same therefore produce the same string regardless of key insertion order.
+ *
+ * Values JSON cannot represent are normalized to `null`: `null`, `undefined`
+ * (also as a property value, so `{ a: undefined }` yields `{"a":null}` rather
+ * than dropping the key), functions and symbols. Objects are walked through
+ * `Object.keys` only, so anything without own enumerable string keys (for
+ * example a `Date`) becomes `{}`.
+ *
+ * @param value - Value to serialize.
+ * @returns The canonical string.
+ * @throws TypeError for `bigint` values, which `JSON.stringify` rejects.
+ */
+export function canonicalize(value: unknown): string {
+  if (value === null || value === undefined) return "null";
+  // JSON.stringify returns undefined for these, which would break the string
+  // return type and leak a literal "undefined" into the enclosing object.
+  if (typeof value === "function" || typeof value === "symbol") return "null";
+  if (typeof value !== "object") return JSON.stringify(value);
+  if (Array.isArray(value)) {
+    return `[${value.map((item) => canonicalize(item)).join(",")}]`;
+  }
+  const obj = value as Record<string, unknown>;
+  const members = Object.keys(obj)
+    .sort()
+    .map((key) => `${JSON.stringify(key)}:${canonicalize(obj[key])}`);
+  return `{${members.join(",")}}`;
+}
+
+/** Returns the SHA-256 digest of `s` as a hex string. */
+function sha256Hex(s: string): string {
+  const hasher = createHash("sha256");
+  hasher.update(s);
+  return hasher.digest("hex");
+}
+
+/**
+ * Derives the deterministic ID of a cell from everything that defines its
+ * configuration: the sweep's `maxTurns` and `temperature` plus the cell's
+ * provider, variant, app, scale, scenario and seed fields. `cell.iteration`
+ * is deliberately left out, so repeated iterations of one cell share a hash.
+ *
+ * @param sweep - Sweep the cell belongs to.
+ * @param cell - Cell to identify; a missing `mcpVariant` is hashed as `null`.
+ * @returns The first 16 hex characters of the SHA-256 of the canonical snapshot.
+ */
+export function configHash(sweep: SweepConfig, cell: Cell): string {
+  const { maxTurns, temperature } = sweep;
+  const { provider, prompt, encoding, optimization, protocol, app, scale, scenario, seed } = cell;
+  const mcpVariant = cell.mcpVariant ?? null;
+
+  const canonical = canonicalize({
+    sweep: { maxTurns, temperature },
+    cell: { provider, prompt, encoding, optimization, protocol, mcpVariant, app, scale, scenario, seed },
+  });
+  const digest = sha256Hex(canonical);
+  return digest.slice(0, 16);
+}
+
+/**
+ * Builds a human-readable, " | "-separated label for a cell.
+ *
+ * The variant segment is the MCP variant (defaulting to "flat") for MCP cells
+ * and `prompt/encoding/optimization` for every other protocol.
+ *
+ * @param cell - Cell to describe.
+ * @returns The label, ending with `seed=<seed>` and `iter=<iteration>`.
+ */
+export function cellLabel(cell: Cell): string {
+  let variantSegment: string;
+  if (cell.protocol === "mcp") {
+    variantSegment = cell.mcpVariant ?? "flat";
+  } else {
+    variantSegment = `${cell.prompt}/${cell.encoding}/${cell.optimization}`;
+  }
+
+  const { kind, model } = cell.provider;
+  const segments = [cell.app, cell.scale, cell.scenario, cell.protocol, variantSegment];
+  segments.push(`${kind}:${model}`, `seed=${cell.seed}`, `iter=${cell.iteration}`);
+  return segments.join(" | ");
+}
+
+/**
+ * Composes the identifier of a single run as `<sweepId>:<cfgHash>:<iteration>`.
+ *
+ * @param sweepId - ID of the sweep the run belongs to.
+ * @param cell - Cell being run; only `iteration` is read.
+ * @param cfgHash - Config hash of the cell.
+ * @returns The colon-separated run ID.
+ */
+export function runId(sweepId: string, cell: Cell, cfgHash: string): string {
+  const iteration = String(cell.iteration);
+  return [sweepId, cfgHash, iteration].join(":");
+}
diff --git a/benchmarks/v2/runner/jsonl.ts b/benchmarks/v2/runner/jsonl.ts
new file mode 100644
index 0000000..6ff7626
--- /dev/null
+++ b/benchmarks/v2/runner/jsonl.ts
@@ -0,0 +1,32 @@
+import { mkdirSync, createWriteStream, type WriteStream } from "node:fs";
+import { dirname } from "node:path";
+
+/** Options accepted by the JsonlWriter constructor. */
+export interface JsonlWriterOpts {
+  /** When true the file is opened with flag "a" (append); otherwise with "w", which truncates it. */
+  append?: boolean;
+}
+
+/**
+ * Minimal JSON Lines writer: one `JSON.stringify`-ed record per line, written
+ * through a Node write stream.
+ *
+ * Lifecycle: `open()` -> any number of `write()` calls -> `close()`.
+ */
+export class JsonlWriter {
+  private stream: WriteStream | null = null;
+
+  /**
+   * @param path - Destination file; missing parent directories are created by `open()`.
+   * @param opts - See `JsonlWriterOpts`; defaults to truncating the file.
+   */
+  constructor(private readonly path: string, private readonly opts: JsonlWriterOpts = {}) {}
+
+  /**
+   * Creates the parent directory if needed and opens the file for writing.
+   * Calling it again while the writer is already open is a no-op: replacing
+   * the stream would leak the first one and, in truncate mode, wipe the file
+   * while earlier writes may still be pending on the old stream.
+   */
+  open() {
+    if (this.stream) return;
+    mkdirSync(dirname(this.path), { recursive: true });
+    // Default: truncate on open so re-running a sweep with the same id starts
+    // fresh. Pass {append: true} to accumulate across runs.
+    this.stream = createWriteStream(this.path, { flags: this.opts.append ? "a" : "w" });
+  }
+
+  /**
+   * Queues `record` as one JSON line.
+   *
+   * @throws Error if the writer has not been opened (or was already closed).
+   */
+  write(record: unknown) {
+    if (!this.stream) throw new Error("JsonlWriter not opened");
+    this.stream.write(`${JSON.stringify(record)}\n`);
+  }
+
+  /**
+   * Ends the stream and resolves once it has finished; rejects if ending the
+   * stream reports an error. Does nothing when the writer is not open.
+   */
+  async close() {
+    const stream = this.stream;
+    if (!stream) return;
+    // Detach first so the writer counts as closed even if `end` fails, and so
+    // a write() issued while closing fails fast instead of hitting an ended stream.
+    this.stream = null;
+    await new Promise<void>((resolve, reject) => {
+      stream.end((err?: Error | null) => (err ? reject(err) : resolve()));
+    });
+  }
+}
diff --git a/benchmarks/v2/runner/mcp-cell.ts b/benchmarks/v2/runner/mcp-cell.ts
new file mode 100644
index 0000000..269de39
--- /dev/null
+++ b/benchmarks/v2/runner/mcp-cell.ts
@@ -0,0 +1,196 @@
+import { resolveApp } from "../apps/registry.ts";
+import { resolveMcpVariant } from "../variants/mcp-variants.ts";
+import type { LlmProvider, ChatMessage, ToolDef } from "../providers/types.ts";
+import type { Cell, CellMetrics, SweepConfig, TurnMetric } from "./types.ts";
+
+/** Arguments for runMcpCell. */
+interface RunMcpCellArgs {
+  /** Cell to execute; selects the app, data scale, scenario and MCP variant. */
+  cell: Cell;
+  /** Sweep the cell belongs to; supplies `maxTurns` and `temperature`. */
+  sweep: SweepConfig;
+  /** LLM backend whose `generate` is called once per turn. */
+  provider: LlmProvider;
+}
+
+/**
+ * Runs one MCP benchmark cell: starts the app's MCP server, lets the model
+ * drive it through tool calls until it replies without any (or the turn
+ * budget runs out), verifies the scenario, and returns the collected metrics.
+ *
+ * @param args.cell - Cell to run; `app`, `scale`, `scenario` and `mcpVariant`
+ *   (default `"flat"`) are read here.
+ * @param args.sweep - Supplies `maxTurns` and `temperature`.
+ * @param args.provider - LLM backend; `generate` is called once per turn.
+ * @returns Metrics for the run. `navigationToolCalls` is always 0 and
+ *   `paramErrorCalls` is never incremented on this path. `finishReason` is
+ *   `"max_turns"` when the loop ended without the model producing a turn free
+ *   of tool calls, and `"done"` otherwise.
+ * @throws If the app has no MCP server or system prompt, if the scenario name
+ *   is unknown, or if the provider, the tool listing, or verification throws.
+ *   Once the server has been started it is stopped in every case.
+ */
+export async function runMcpCell({ cell, sweep, provider }: RunMcpCellArgs): Promise<CellMetrics> {
+  const app = resolveApp(cell.app);
+  if (!app.startMcpServer || !app.mcpSystemPrompt) {
+    throw new Error(`App ${cell.app} does not expose an MCP server`);
+  }
+  const variant = cell.mcpVariant ?? "flat";
+  const scenario = app.scenarios.find((s) => s.name === cell.scenario);
+  if (!scenario) throw new Error(`Scenario "${cell.scenario}" not found on app ${cell.app}`);
+
+  const t0 = performance.now();
+
+  let transportBytesSent = 0;
+  let transportBytesReceived = 0;
+  let inputTokens = 0;
+  let outputTokens = 0;
+  let maxContextTokens = 0;
+  let turns = 0;
+  let totalToolCalls = 0;
+  let affordanceToolCalls = 0;
+  let unknownToolCalls = 0;
+  let invokeErrorCalls = 0;
+  let paramErrorCalls = 0;
+  let llmTimeMs = 0;
+  let setupTimeMs = 0;
+  let timeToFirstToolCallMs: number | null = null;
+  let finishReason: CellMetrics["finishReason"] = "done";
+  // Stays true unless the model ends the conversation itself by replying
+  // without tool calls. Inspecting the last history entry cannot detect this:
+  // after a tool-calling turn the last entry is always a tool result message.
+  let ranOutOfTurns = true;
+  const turnBreakdown: TurnMetric[] = [];
+
+  // Setup time covers starting the server and listing its tools.
+  const tSetup = performance.now();
+  const handle = await app.startMcpServer(cell.scale, variant);
+  let verification: Awaited<ReturnType<NonNullable<typeof handle.verify>>> | undefined;
+  try {
+    const listed = await handle.client.listTools();
+    setupTimeMs = performance.now() - tSetup;
+
+    const mcpToolNames = new Set(listed.tools.map((t) => t.name));
+    const tools: ToolDef[] = listed.tools.map((t) => ({
+      name: t.name,
+      description: t.description ?? "",
+      parameters: (t.inputSchema as Record<string, unknown>) ?? { type: "object", properties: {} },
+    }));
+
+    const buildMcpPrompt = resolveMcpVariant(variant);
+    const systemPrompt = buildMcpPrompt(app.mcpSystemPrompt);
+
+    const history: ChatMessage[] = [{ role: "user", content: scenario.agentPrompt }];
+    const tAgentStart = performance.now();
+
+    while (turns < sweep.maxTurns) {
+      turns += 1;
+      const turnIndex = turns - 1;
+      const tGen = performance.now();
+      const res = await provider.generate({
+        systemPrompt,
+        messages: history,
+        tools,
+        temperature: sweep.temperature,
+      });
+      const turnLatency = performance.now() - tGen;
+      llmTimeMs += turnLatency;
+      inputTokens += res.usage.inputTokens;
+      outputTokens += res.usage.outputTokens;
+      // Peak context = largest prompt the provider reported for a single turn.
+      if (res.usage.inputTokens > maxContextTokens) maxContextTokens = res.usage.inputTokens;
+
+      history.push(res.message);
+      const calls = res.message.toolCalls ?? [];
+      const turn: TurnMetric = {
+        index: turnIndex,
+        inputTokens: res.usage.inputTokens,
+        outputTokens: res.usage.outputTokens,
+        latencyMs: turnLatency,
+        toolCalls: calls.length,
+        toolCallKinds: [],
+      };
+
+      // A reply without tool calls is the model's final answer.
+      if (calls.length === 0) {
+        turnBreakdown.push(turn);
+        ranOutOfTurns = false;
+        break;
+      }
+
+      if (timeToFirstToolCallMs === null) timeToFirstToolCallMs = performance.now() - tAgentStart;
+
+      // Every call gets exactly one tool message back, whatever its outcome,
+      // so the next generation sees a result for each tool call id.
+      for (const call of calls) {
+        totalToolCalls += 1;
+        if (!mcpToolNames.has(call.name)) {
+          unknownToolCalls += 1;
+          turn.toolCallKinds.push("unknown");
+          history.push({
+            role: "tool",
+            content: JSON.stringify({ error: `unknown tool: ${call.name}` }),
+            toolCallId: call.id,
+            name: call.name,
+          });
+          continue;
+        }
+        try {
+          const result = (await handle.client.callTool({ name: call.name, arguments: call.arguments })) as {
+            content?: Array<{ type: string; text?: string }>;
+            isError?: boolean;
+          };
+          // NOTE(review): both transport counters use string `.length` (UTF-16
+          // code units), which equals the byte count only for ASCII payloads,
+          // and failed invocations are not counted. Confirm this matches the
+          // accounting on the SLOP path before changing it.
+          const sent = JSON.stringify({ name: call.name, arguments: call.arguments }).length;
+          const resultText = (result.content ?? [])
+            .filter((c) => c.type === "text")
+            .map((c) => c.text ?? "")
+            .join("");
+          const isError = result.isError === true;
+          if (isError) {
+            invokeErrorCalls += 1;
+            turn.toolCallKinds.push("invoke_error");
+          } else {
+            affordanceToolCalls += 1;
+            turn.toolCallKinds.push("affordance");
+          }
+          if (process.env.BENCH_DEBUG) {
+            console.error(
+              `[mcp-cell] ${isError ? "ERR " : ""}${call.name}(${JSON.stringify(call.arguments).slice(0, 200)}) → ${resultText.slice(0, 200)}`,
+            );
+          }
+          transportBytesSent += sent;
+          transportBytesReceived += resultText.length;
+          history.push({
+            role: "tool",
+            content: resultText || JSON.stringify({ status: "ok" }),
+            toolCallId: call.id,
+            name: call.name,
+          });
+        } catch (err) {
+          // A thrown invocation is reported to the model like a tool-level error.
+          invokeErrorCalls += 1;
+          turn.toolCallKinds.push("invoke_error");
+          history.push({
+            role: "tool",
+            content: JSON.stringify({ error: `invoke failed: ${err instanceof Error ? err.message : String(err)}` }),
+            toolCallId: call.id,
+            name: call.name,
+          });
+        }
+      }
+
+      turnBreakdown.push(turn);
+    }
+
+    if (ranOutOfTurns) finishReason = "max_turns";
+
+    // Verification must run while the MCP server is still alive (it rebuilds state via tool calls).
+    verification = await handle.verify(scenario);
+  } finally {
+    await handle.stop();
+  }
+
+  const totalTimeMs = performance.now() - t0;
+  // Share of attempted calls that hit a real tool and did not error; 1 when
+  // the model made no calls at all.
+  const attemptedCalls = affordanceToolCalls + unknownToolCalls + paramErrorCalls + invokeErrorCalls;
+  const specComplianceRate = attemptedCalls > 0 ? affordanceToolCalls / attemptedCalls : 1;
+
+  return {
+    turns,
+    toolCalls: totalToolCalls,
+    navigationToolCalls: 0,
+    affordanceToolCalls,
+    unknownToolCalls,
+    invokeErrorCalls,
+    paramErrorCalls,
+    inputTokens,
+    outputTokens,
+    totalTokens: inputTokens + outputTokens,
+    maxContextTokens,
+    timeToFirstToolCallMs,
+    setupTimeMs,
+    llmTimeMs,
+    totalTimeMs,
+    transportBytesSent,
+    transportBytesReceived,
+    specComplianceRate,
+    finishReason,
+    turnBreakdown,
+    verification: verification
+      ? {
+          passed: verification.passed,
+          totalChecks: verification.checks.length,
+          passedChecks: verification.checks.filter((c) => c.passed).length,
+          failures: verification.checks.filter((c) => !c.passed).map((c) => `${c.name}${c.detail ? `: ${c.detail}` : ""}`),
+        }
+      : undefined,
+  };
+}
diff --git a/benchmarks/v2/runner/slop-cell.ts b/benchmarks/v2/runner/slop-cell.ts
new file mode 100644
index 0000000..36bd1fa
--- /dev/null
+++ b/benchmarks/v2/runner/slop-cell.ts
@@ -0,0 +1,299 @@
+import { SlopConsumer, WebSocketClientTransport, affordancesToTools } from "@slop-ai/consumer";
+import type { SlopNode } from "@slop-ai/consumer";
+import { resolveApp } from "../apps/registry.ts";
+import { resolveEncoding } from "../variants/encodings.ts";
+import { resolveOptimization } from "../variants/optimizations.ts";
+import { resolvePrompt } from "../variants/prompts.ts";
+import type { LlmProvider, ChatMessage, ToolDef } from "../providers/types.ts";
+import type { Cell, CellMetrics, SweepConfig, TurnMetric } from "./types.ts";
+
+/** Arguments accepted by `runSlopCell`. */
+interface RunSlopCellArgs {
+  /** Cell to execute; supplies the app, scenario, scale, seed and the prompt/encoding/optimization variant ids. */
+  cell: Cell;
+  /** Sweep-wide settings; the SLOP runner reads `maxTurns` and `temperature` from it. */
+  sweep: SweepConfig;
+  /** LLM provider asked to generate every assistant turn. */
+  provider: LlmProvider;
+  /** Port handed to the app's `startSlopServer`. */
+  port: number;
+}
+
+/**
+ * Runs one SLOP benchmark cell end to end and returns its metrics.
+ *
+ * Creates the app store, starts the app's SLOP server on `port`, connects a
+ * consumer over WebSocket and subscribes to `/`, then drives the LLM in a
+ * tool-calling loop for at most `sweep.maxTurns` turns. Every tool call is
+ * classified as navigation (`slop_query`, `slop_get_state`), a successful
+ * affordance invoke, an unknown tool, a parameter error, or an invoke error.
+ * The server is always stopped; the scenario is then verified against the
+ * in-process store.
+ *
+ * @param args - Cell, sweep settings, provider and server port.
+ * @returns Aggregated metrics for the cell, including the per-turn breakdown
+ *   and the verification summary (when the app returns one).
+ * @throws Error when `cell.scenario` does not exist on the resolved app.
+ *   Failures from the provider, the consumer connection or `slop_query` are
+ *   not caught here and propagate to the caller; only `invoke` failures are
+ *   reported back to the model as tool errors.
+ */
+export async function runSlopCell({ cell, sweep, provider, port }: RunSlopCellArgs): Promise<CellMetrics> {
+  const app = resolveApp(cell.app);
+  const optimization = resolveOptimization(cell.optimization);
+  const encode = resolveEncoding(cell.encoding);
+  const buildPrompt = resolvePrompt(cell.prompt);
+
+  const scenario = app.scenarios.find((s) => s.name === cell.scenario);
+  if (!scenario) throw new Error(`Scenario "${cell.scenario}" not found on app ${cell.app}`);
+
+  const t0 = performance.now();
+  const store = app.createStore(cell.scale, cell.seed);
+  const server = await app.startSlopServer(store, port, optimization.serverOpts);
+
+  let transportBytesSent = 0;
+  let transportBytesReceived = 0;
+  let inputTokens = 0;
+  let outputTokens = 0;
+  let maxContextTokens = 0;
+  let turns = 0;
+  let totalToolCalls = 0;
+  let navigationToolCalls = 0;
+  let affordanceToolCalls = 0;
+  let unknownToolCalls = 0;
+  let invokeErrorCalls = 0;
+  let paramErrorCalls = 0;
+  let llmTimeMs = 0;
+  let setupTimeMs = 0;
+  let timeToFirstToolCallMs: number | null = null;
+  let finishReason: CellMetrics["finishReason"] = "done";
+  const turnBreakdown: TurnMetric[] = [];
+
+  try {
+    // Setup time covers connecting the consumer and receiving the first snapshot.
+    const tSetup = performance.now();
+    const transport = new WebSocketClientTransport(server.wsUrl);
+    const consumer = new SlopConsumer(transport);
+    await consumer.connect();
+    const { id: subId, snapshot } = await consumer.subscribe("/", -1);
+    setupTimeMs = performance.now() - tSetup;
+
+    let toolSet = affordancesToTools(snapshot);
+    const initialStateText = encode(snapshot);
+    const systemPrompt = buildPrompt(initialStateText);
+
+    // Navigation tools are always offered in addition to the affordance-derived tools.
+    const navigationTools: ToolDef[] = [
+      {
+        name: "slop_query",
+        description:
+          "Load the full subtree at a given path. Use this to expand windowed collections, load lazy children, or resolve stub nodes. Returns the subtree with all properties, children, and affordances.",
+        parameters: {
+          type: "object",
+          properties: {
+            path: { type: "string", description: "Tree path to load" },
+            depth: { type: "integer", description: "Resolution depth; -1 for full. Default: -1" },
+          },
+          required: ["path"],
+        },
+      },
+      {
+        name: "slop_get_state",
+        description: "Return the current full state tree.",
+        parameters: { type: "object", properties: {} },
+      },
+    ];
+
+    // Rebuilt every turn because `toolSet` is replaced/extended as the tree changes.
+    const buildTools = (): ToolDef[] => [
+      ...toolSet.tools.map((t) => ({
+        name: t.function.name,
+        description: t.function.description,
+        parameters: t.function.parameters as Record<string, unknown>,
+      })),
+      ...navigationTools,
+    ];
+
+    const history: ChatMessage[] = [{ role: "user", content: scenario.agentPrompt }];
+    const tAgentStart = performance.now();
+
+    // True once the model answers without requesting any tool. The turn budget
+    // was exhausted iff the loop ends with this still false. (Inspecting the
+    // last history entry does not work: after a tool-calling turn the last
+    // entry is always a `role: "tool"` result, which carries no `toolCalls`.)
+    let endedWithoutToolCalls = false;
+
+    while (turns < sweep.maxTurns) {
+      turns += 1;
+      const turnIndex = turns - 1;
+      const tGen = performance.now();
+      const res = await provider.generate({
+        systemPrompt,
+        messages: history,
+        tools: buildTools(),
+        temperature: sweep.temperature,
+      });
+      const turnLatency = performance.now() - tGen;
+      llmTimeMs += turnLatency;
+      inputTokens += res.usage.inputTokens;
+      outputTokens += res.usage.outputTokens;
+      if (res.usage.inputTokens > maxContextTokens) maxContextTokens = res.usage.inputTokens;
+
+      history.push(res.message);
+      const calls = res.message.toolCalls ?? [];
+      const turn: TurnMetric = {
+        index: turnIndex,
+        inputTokens: res.usage.inputTokens,
+        outputTokens: res.usage.outputTokens,
+        latencyMs: turnLatency,
+        toolCalls: calls.length,
+        toolCallKinds: [],
+      };
+
+      if (calls.length === 0) {
+        turnBreakdown.push(turn);
+        endedWithoutToolCalls = true;
+        finishReason = "done";
+        break;
+      }
+
+      if (timeToFirstToolCallMs === null) timeToFirstToolCallMs = performance.now() - tAgentStart;
+
+      let treeChanged = false;
+
+      for (const call of calls) {
+        totalToolCalls += 1;
+
+        if (call.name === "slop_query") {
+          navigationToolCalls += 1;
+          turn.toolCallKinds.push("slop_query");
+          const path = String(call.arguments.path ?? "/");
+          // Anything that is not a finite number falls back to -1.
+          const depth = Number.isFinite(call.arguments.depth) ? Number(call.arguments.depth) : -1;
+          const subtree = await consumer.query(path, depth);
+          // Transport sizes are approximated by the JSON string length of the payloads.
+          transportBytesSent += JSON.stringify({ type: "query", path, depth }).length;
+          transportBytesReceived += JSON.stringify(subtree).length;
+          const subtreeText = encode(subtree as SlopNode);
+          history.push({
+            role: "tool",
+            content: JSON.stringify({ path, tree: subtreeText }),
+            toolCallId: call.id,
+            name: call.name,
+          });
+          mergeDiscoveredAffordances(toolSet, subtree as SlopNode, path);
+          treeChanged = true;
+          continue;
+        }
+
+        if (call.name === "slop_get_state") {
+          navigationToolCalls += 1;
+          turn.toolCallKinds.push("slop_get_state");
+          const currentTree = consumer.getTree(subId);
+          const text = currentTree ? encode(currentTree) : "No state available";
+          transportBytesReceived += text.length;
+          history.push({
+            role: "tool",
+            content: JSON.stringify({ tree: text }),
+            toolCallId: call.id,
+            name: call.name,
+          });
+          continue;
+        }
+
+        const resolved = toolSet.resolve(call.name);
+        if (!resolved) {
+          unknownToolCalls += 1;
+          turn.toolCallKinds.push("unknown");
+          history.push({
+            role: "tool",
+            content: JSON.stringify({ error: `unknown tool: ${call.name}` }),
+            toolCallId: call.id,
+            name: call.name,
+          });
+          continue;
+        }
+
+        const invokePath = resolvePath(resolved, call.arguments);
+        if (!invokePath) {
+          paramErrorCalls += 1;
+          turn.toolCallKinds.push("param_error");
+          history.push({
+            role: "tool",
+            content: JSON.stringify({ error: "missing target for grouped affordance" }),
+            toolCallId: call.id,
+            name: call.name,
+          });
+          continue;
+        }
+
+        try {
+          const result = await consumer.invoke(invokePath, resolved.action, call.arguments);
+          affordanceToolCalls += 1;
+          turn.toolCallKinds.push("affordance");
+          transportBytesSent += JSON.stringify({ path: invokePath, action: resolved.action, params: call.arguments }).length;
+          transportBytesReceived += JSON.stringify(result).length;
+          history.push({
+            role: "tool",
+            content: JSON.stringify(result.data ?? { status: result.status }),
+            toolCallId: call.id,
+            name: call.name,
+          });
+          treeChanged = true;
+        } catch (err) {
+          // Invoke failures are reported to the model so it can recover.
+          invokeErrorCalls += 1;
+          turn.toolCallKinds.push("invoke_error");
+          history.push({
+            role: "tool",
+            content: JSON.stringify({ error: `invoke failed: ${err instanceof Error ? err.message : String(err)}` }),
+            toolCallId: call.id,
+            name: call.name,
+          });
+        }
+      }
+
+      turnBreakdown.push(turn);
+
+      // Refresh the tool set from the subscribed tree after queries/invokes.
+      if (treeChanged) {
+        const updated = consumer.getTree(subId);
+        if (updated) toolSet = affordancesToTools(updated);
+      }
+    }
+
+    if (!endedWithoutToolCalls) {
+      finishReason = "max_turns";
+    }
+
+    consumer.disconnect();
+  } finally {
+    await server.stop();
+  }
+
+  const verification = app.verify(store, scenario);
+  const totalTimeMs = performance.now() - t0;
+  // Navigation calls are excluded: compliance only rates attempted affordance calls.
+  const attemptedCalls = affordanceToolCalls + unknownToolCalls + paramErrorCalls + invokeErrorCalls;
+  const specComplianceRate = attemptedCalls > 0 ? affordanceToolCalls / attemptedCalls : 1;
+
+  return {
+    turns,
+    toolCalls: totalToolCalls,
+    navigationToolCalls,
+    affordanceToolCalls,
+    unknownToolCalls,
+    invokeErrorCalls,
+    paramErrorCalls,
+    inputTokens,
+    outputTokens,
+    totalTokens: inputTokens + outputTokens,
+    maxContextTokens,
+    timeToFirstToolCallMs,
+    setupTimeMs,
+    llmTimeMs,
+    totalTimeMs,
+    transportBytesSent,
+    transportBytesReceived,
+    specComplianceRate,
+    finishReason,
+    turnBreakdown,
+    verification: verification
+      ? {
+          passed: verification.passed,
+          totalChecks: verification.checks.length,
+          passedChecks: verification.checks.filter((c) => c.passed).length,
+          failures: verification.checks.filter((c) => !c.passed).map((c) => `${c.name}${c.detail ? `: ${c.detail}` : ""}`),
+        }
+      : undefined,
+  };
+}
+
+/**
+ * Picks the tree path an affordance call should be invoked on.
+ *
+ * A tool bound to a single node carries its own `path`. Otherwise the caller
+ * must name the node via `args.target`, and that value is accepted only when
+ * it is a string listed in `resolved.targets`.
+ *
+ * @returns The path to invoke on, or `null` when no valid target was given.
+ */
+function resolvePath(
+  resolved: { path: string | null; action: string; targets?: string[] },
+  args: Record<string, unknown>,
+): string | null {
+  const { path, targets } = resolved;
+  if (path) return path;
+
+  const candidate = args.target;
+  if (typeof candidate !== "string") return null;
+  return targets?.includes(candidate) ? candidate : null;
+}
+
+/**
+ * Extends `existing` in place with the tools found in a queried subtree.
+ *
+ * Tools whose function name is already present are not added again. The
+ * `resolve` function is wrapped so that names unknown to the previous resolver
+ * fall through to the subtree's resolver.
+ */
+function mergeDiscoveredAffordances(
+  existing: ReturnType<typeof affordancesToTools>,
+  subtree: SlopNode,
+  subtreePath: string,
+) {
+  const discovered = affordancesToTools(subtree, subtreePath);
+  // Bind both resolvers up front: `existing.resolve` is reassigned below.
+  const resolveKnown = existing.resolve.bind(existing);
+  const resolveDiscovered = discovered.resolve.bind(discovered);
+
+  for (const candidate of discovered.tools) {
+    const alreadyListed = existing.tools.some((known) => known.function.name === candidate.function.name);
+    if (alreadyListed) continue;
+    existing.tools.push(candidate);
+  }
+
+  existing.resolve = (name: string) => resolveKnown(name) ?? resolveDiscovered(name);
+}
diff --git a/benchmarks/v2/runner/sweep.ts b/benchmarks/v2/runner/sweep.ts
new file mode 100644
index 0000000..d770c06
--- /dev/null
+++ b/benchmarks/v2/runner/sweep.ts
@@ -0,0 +1,231 @@
+import { existsSync, readFileSync } from "node:fs";
+import { join } from "node:path";
+import type { LlmProvider } from "../providers/types.ts";
+import { OpenAICompatProvider } from "../providers/openai-compat.ts";
+import { resolveApp } from "../apps/registry.ts";
+import { runSlopCell } from "./slop-cell.ts";
+import { runMcpCell } from "./mcp-cell.ts";
+import { cellLabel, configHash, runId } from "./hash.ts";
+import { JsonlWriter } from "./jsonl.ts";
+import type { Cell, ProviderConfig, RunRecord, SweepConfig } from "./types.ts";
+
+/** First port handed to a cell; the sweep increments from here for each cell it runs. */
+const BASE_PORT = 4198;
+
+/** Optional knobs for `runSweep`. */
+export interface SweepRunOptions {
+  /** Directory that holds one sub-directory per sweep id. Defaults to `../results` relative to this module. */
+  resultsRoot?: string;
+  /** Only list the expanded cells (hash + label); no cell is executed. */
+  dryRun?: boolean;
+  /** Truncate existing runs.jsonl and start over. Default: resume if data exists. */
+  fresh?: boolean;
+  /** Called with each record right after it has been handed to the JSONL writer. */
+  onRecord?: (record: RunRecord) => void;
+}
+
+/**
+ * Expands a sweep into cells and runs them one after another, writing one
+ * JSONL record per executed cell to `<resultsRoot>/<sweep.id>/runs.jsonl`.
+ *
+ * Unless `opts.fresh` is set, an existing `runs.jsonl` is scanned first: cells
+ * whose run id already has a record with metrics and no error are skipped and
+ * new records are appended. Errored cells are retried.
+ *
+ * With `opts.dryRun` the expanded cells are only logged. The results file is
+ * not opened in that mode, so a dry run can never truncate recorded data.
+ *
+ * A failure inside a single cell is captured in that cell's `error` field and
+ * does not abort the sweep.
+ *
+ * @param sweep - Sweep definition to expand and execute.
+ * @param opts - Output location, dry-run/fresh flags and a per-record callback.
+ * @returns The expanded cells, the number of cells processed (`recorded`,
+ *   which includes skipped ones) and the number skipped because of resume.
+ */
+export async function runSweep(sweep: SweepConfig, opts: SweepRunOptions = {}) {
+  const resultsRoot = opts.resultsRoot ?? join(import.meta.dir, "..", "results");
+  const outDir = join(resultsRoot, sweep.id);
+  const jsonlPath = join(outDir, "runs.jsonl");
+
+  // Resume: scan existing runs.jsonl (if any) and collect completed run IDs.
+  // A cell is considered "done" when a record with its runId and no `error`
+  // field is present. Errored cells are retried on resume.
+  const completedRunIds = new Set<string>();
+  let appending = false;
+  if (!opts.fresh && existsSync(jsonlPath)) {
+    try {
+      const raw = readFileSync(jsonlPath, "utf8");
+      for (const line of raw.split("\n")) {
+        if (!line.trim()) continue;
+        try {
+          const rec = JSON.parse(line) as Partial<RunRecord> & { runId?: string; error?: string };
+          if (rec.runId && !rec.error && rec.metrics) completedRunIds.add(rec.runId);
+        } catch {
+          // Ignore malformed lines
+        }
+      }
+      if (completedRunIds.size > 0) {
+        appending = true;
+        console.log(`[sweep] resume: ${completedRunIds.size} completed runs already recorded, appending`);
+      }
+    } catch (err) {
+      console.warn(`[sweep] resume: failed to read ${jsonlPath}: ${err}`);
+    }
+  }
+
+  const cells = expand(sweep);
+  console.log(`[sweep] ${sweep.id}: ${cells.length} cells`);
+
+  // Dry runs return before the writer is created: opening it in non-append
+  // mode would replace an existing results file with just a header line.
+  if (opts.dryRun) {
+    for (const cell of cells) {
+      const h = configHash(sweep, cell);
+      console.log(`[dry] ${h} ${cellLabel(cell)}`);
+    }
+    return { cells, recorded: 0, skipped: 0 };
+  }
+
+  const writer = new JsonlWriter(jsonlPath, { append: appending });
+  writer.open();
+
+  const providerCache = new Map<string, LlmProvider>();
+  let done = 0;
+  let skipped = 0;
+  let portCursor = BASE_PORT;
+
+  try {
+    // The sweep header is only written when starting a new file.
+    if (!appending) {
+      writer.write({ type: "sweep", config: sweep, startedAt: new Date().toISOString() });
+    }
+
+    for (const cell of cells) {
+      const h = configHash(sweep, cell);
+      const id = runId(sweep.id, cell, h);
+      if (completedRunIds.has(id)) {
+        skipped += 1;
+        done += 1;
+        console.log(`[${done}/${cells.length}] SKIP ${h} ${cellLabel(cell)}`);
+        continue;
+      }
+      const startedAt = new Date().toISOString();
+      const t0 = performance.now();
+
+      const record: RunRecord = {
+        sweepId: sweep.id,
+        cellId: h,
+        runId: id,
+        configHash: h,
+        cell,
+        startedAt,
+        durationMs: 0,
+      };
+
+      try {
+        const provider = getOrCreateProvider(providerCache, cell.provider);
+        // Every executed cell consumes a port number, including MCP cells that do not use it.
+        const port = portCursor++;
+
+        if (cell.protocol === "slop") {
+          const metrics = await runSlopCell({ cell, sweep, provider, port });
+          record.metrics = metrics;
+        } else if (cell.protocol === "mcp") {
+          const metrics = await runMcpCell({ cell, sweep, provider });
+          record.metrics = metrics;
+        } else {
+          throw new Error(`Unknown protocol: ${cell.protocol}`);
+        }
+      } catch (err) {
+        record.error = err instanceof Error ? `${err.message}\n${err.stack ?? ""}` : String(err);
+      } finally {
+        record.durationMs = performance.now() - t0;
+      }
+
+      writer.write(record);
+      opts.onRecord?.(record);
+      done += 1;
+
+      const status = record.error
+        ? "ERR"
+        : record.metrics?.verification
+          ? record.metrics.verification.passed
+            ? "PASS"
+            : "FAIL"
+          : "—";
+      console.log(
+        `[${done}/${cells.length}] ${status} ${h} t=${record.durationMs.toFixed(0)}ms ${cellLabel(cell)}`,
+      );
+    }
+  } finally {
+    // Close the writer even when a write or the `onRecord` callback throws.
+    await writer.close();
+  }
+
+  if (skipped > 0) console.log(`[sweep] done: ${done - skipped} ran, ${skipped} resumed`);
+  return { cells, recorded: done, skipped };
+}
+
+/**
+ * Turns a sweep definition into the ordered list of cells to run.
+ *
+ * Apps with no supported scale among `sweep.dataScales`, or with no scenario
+ * left after `sweep.scenarioFilter`, are skipped with a warning. For every
+ * provider × scale × scenario × seed × protocol combination:
+ * - `slop` fans out over prompt × encoding × optimization variants;
+ * - `mcp` fans out over `sweep.mcpVariants` (default `["flat"]`) and marks the
+ *   SLOP-only dimensions as `"n/a"`.
+ * Each combination is repeated `sweep.iterations` times.
+ */
+function expand(sweep: SweepConfig): Cell[] {
+  const out: Cell[] = [];
+  const wanted = sweep.scenarioFilter;
+
+  for (const appId of sweep.apps) {
+    const app = resolveApp(appId);
+
+    const scales = sweep.dataScales.filter((candidate) => app.supportedScales.includes(candidate));
+    if (scales.length === 0) {
+      console.warn(`[sweep] app ${appId}: no supported scales in ${JSON.stringify(sweep.dataScales)} (supported: ${app.supportedScales.join(", ")})`);
+      continue;
+    }
+
+    const scenarios =
+      wanted && wanted.length > 0 ? app.scenarios.filter((candidate) => wanted.includes(candidate.name)) : app.scenarios;
+    if (scenarios.length === 0) {
+      console.warn(`[sweep] app ${appId}: no matching scenarios`);
+      continue;
+    }
+
+    for (const provider of sweep.providers) {
+      for (const scale of scales) {
+        for (const scenario of scenarios) {
+          for (const seed of sweep.seeds) {
+            // Fields shared by every cell of this combination; spread after the
+            // variant fields so the key order of each cell object stays the same.
+            const shared = { app: appId, scale, scenario: scenario.name, seed };
+
+            for (const protocol of sweep.protocols) {
+              if (protocol === "slop") {
+                for (const prompt of sweep.promptVariants) {
+                  for (const encoding of sweep.encodingVariants) {
+                    for (const optimization of sweep.optimizationVariants) {
+                      for (let iteration = 0; iteration < sweep.iterations; iteration += 1) {
+                        out.push({ provider, prompt, encoding, optimization, protocol, ...shared, iteration });
+                      }
+                    }
+                  }
+                }
+                continue;
+              }
+
+              if (protocol === "mcp") {
+                for (const mcpVariant of sweep.mcpVariants ?? ["flat"]) {
+                  for (let iteration = 0; iteration < sweep.iterations; iteration += 1) {
+                    out.push({
+                      provider,
+                      prompt: "n/a",
+                      encoding: "n/a",
+                      optimization: "n/a",
+                      protocol,
+                      mcpVariant,
+                      ...shared,
+                      iteration,
+                    });
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return out;
+}
+
+/**
+ * Returns the provider for `cfg`, creating and caching it on first use.
+ *
+ * The cache key is built from `kind`, `baseUrl`, `model` and `id`.
+ *
+ * @throws Error for provider kinds other than `openai-compat`, and for an
+ *   `openai-compat` config without `baseUrl`.
+ */
+function getOrCreateProvider(cache: Map<string, LlmProvider>, cfg: ProviderConfig): LlmProvider {
+  const { kind, baseUrl, model, apiKey, id } = cfg;
+  const cacheKey = `${kind}|${baseUrl ?? ""}|${model}|${id ?? ""}`;
+
+  const hit = cache.get(cacheKey);
+  if (hit) return hit;
+
+  if (kind !== "openai-compat") {
+    throw new Error(`Provider kind not yet implemented: ${kind}`);
+  }
+  if (!baseUrl) throw new Error("openai-compat provider requires baseUrl");
+
+  const created: LlmProvider = new OpenAICompatProvider({ baseUrl, model, apiKey, id });
+  cache.set(cacheKey, created);
+  return created;
+}
diff --git a/benchmarks/v2/runner/types.ts b/benchmarks/v2/runner/types.ts
new file mode 100644
index 0000000..2e485c7
--- /dev/null
+++ b/benchmarks/v2/runner/types.ts
@@ -0,0 +1,99 @@
+/** Identifier of a benchmark application. */
+export type AppId = "issue-tracker" | "todo" | "file-browser" | "crm";
+/** Data-scale label; a sweep only uses the scales an app lists as supported. */
+export type DataScale = "s" | "m" | "l" | "xl";
+/** Protocol a cell is executed with. */
+export type Protocol = "slop" | "mcp";
+
+/** Describes which LLM backend a cell talks to. */
+export interface ProviderConfig {
+  /** Backend family. */
+  kind: "openai-compat" | "gemini" | "anthropic";
+  /** Endpoint base URL. */
+  baseUrl?: string;
+  /** Model name requested from the backend. */
+  model: string;
+  /** API key forwarded to the provider, if any. */
+  apiKey?: string;
+  /** Optional provider id forwarded to the provider. */
+  id?: string;
+}
+
+/** Declarative description of an experiment sweep; it is expanded into individual cells. */
+export interface SweepConfig {
+  /** Sweep identifier. */
+  id: string;
+  /** Providers to run every combination against. */
+  providers: ProviderConfig[];
+  /** Prompt variant ids. */
+  promptVariants: string[];
+  /** Tree-encoding variant ids. */
+  encodingVariants: string[];
+  /** Server-optimization variant ids. */
+  optimizationVariants: string[];
+  /** Protocols to run. */
+  protocols: Protocol[];
+  /** MCP variant ids. */
+  mcpVariants?: string[];
+  /** Apps to include. */
+  apps: AppId[];
+  /** Requested data scales. */
+  dataScales: DataScale[];
+  /** When present, only scenarios with these names are candidates — presumably an empty list means no filtering; confirm against the sweep expansion. */
+  scenarioFilter?: string[];
+  /** Seeds to run. */
+  seeds: number[];
+  /** Number of repetitions per combination. */
+  iterations: number;
+  /** NOTE(review): presumably the intended parallelism limit — confirm where (or whether) it is consumed. */
+  maxConcurrency: number;
+  /** Upper bound on LLM turns per cell. */
+  maxTurns: number;
+  /** Sampling temperature for LLM generation. */
+  temperature: number;
+}
+
+/** One concrete point of the sweep matrix: a single run configuration. */
+export interface Cell {
+  /** Provider the cell runs against. */
+  provider: ProviderConfig;
+  /** Prompt variant id. */
+  prompt: string;
+  /** Encoding variant id. */
+  encoding: string;
+  /** Optimization variant id. */
+  optimization: string;
+  /** Protocol the cell is executed with. */
+  protocol: Protocol;
+  /** MCP variant id, where one applies. */
+  mcpVariant?: string;
+  /** Application under test. */
+  app: AppId;
+  /** Data scale. */
+  scale: DataScale;
+  /** Name of the scenario to run. */
+  scenario: string;
+  /** Seed for the cell. */
+  seed: number;
+  /** Repetition index for this combination. */
+  iteration: number;
+}
+
+/** Measurements for a single assistant turn. */
+export interface TurnMetric {
+  /** Position of the turn within the run — presumably zero-based; confirm against the cell runners. */
+  index: number;
+  /** Input tokens for this turn. */
+  inputTokens: number;
+  /** Output tokens for this turn. */
+  outputTokens: number;
+  /** Latency of this turn in milliseconds. */
+  latencyMs: number;
+  /** Number of tool calls in this turn. */
+  toolCalls: number;
+  /** Assistant tool calls classified in this turn, for taxonomy. */
+  toolCallKinds: ("slop_query" | "slop_get_state" | "affordance" | "unknown" | "param_error" | "invoke_error")[];
+}
+
+/** Aggregated measurements for one executed cell. */
+export interface CellMetrics {
+  /** Number of LLM turns. */
+  turns: number;
+  /** Total tool calls issued by the model, of every kind. */
+  toolCalls: number;
+  /** Navigation tool calls; the kind list in `TurnMetric` names `slop_query` and `slop_get_state`. */
+  navigationToolCalls: number;
+  /** Affordance calls that were invoked successfully. */
+  affordanceToolCalls: number;
+  /** Calls whose tool name could not be resolved. */
+  unknownToolCalls: number;
+  /** Calls that hit the right affordance but threw during invoke. */
+  invokeErrorCalls: number;
+  /** Calls that resolved to a valid affordance but had malformed params. */
+  paramErrorCalls: number;
+  /** Input tokens summed over the run. */
+  inputTokens: number;
+  /** Output tokens summed over the run. */
+  outputTokens: number;
+  /** Total token count for the run. */
+  totalTokens: number;
+  /** Max prompt_tokens observed on any single turn — proxy for peak context pressure. */
+  maxContextTokens: number;
+  /** Wall-clock ms from user prompt send to first assistant tool call. null = never called a tool. */
+  timeToFirstToolCallMs: number | null;
+  /** Setup time in ms. */
+  setupTimeMs: number;
+  /** Time spent in LLM generation, in ms. */
+  llmTimeMs: number;
+  /** Total time for the cell, in ms. */
+  totalTimeMs: number;
+  /** Size of the requests sent over the transport. */
+  transportBytesSent: number;
+  /** Size of the responses received over the transport. */
+  transportBytesReceived: number;
+  /**
+   * affordanceToolCalls / (affordanceToolCalls + unknownToolCalls + paramErrorCalls + invokeErrorCalls).
+   * Navigation calls are not part of the ratio. 1.0 = every attempted affordance call succeeded
+   * (also reported when no affordance call was attempted).
+   */
+  specComplianceRate: number;
+  /** Why the run ended. */
+  finishReason: "done" | "max_turns" | "error";
+  /** Per-turn measurements. */
+  turnBreakdown: TurnMetric[];
+  /** Summary of the scenario verification, when one was produced. */
+  verification?: {
+    /** Overall verdict. */
+    passed: boolean;
+    /** Number of checks evaluated. */
+    totalChecks: number;
+    /** Number of checks that passed. */
+    passedChecks: number;
+    /** Descriptions of the failed checks. */
+    failures: string[];
+  };
+}
+
+/** One record of a sweep's results, describing a single cell run. */
+export interface RunRecord {
+  /** Id of the sweep the run belongs to. */
+  sweepId: string;
+  /** Cell identifier. */
+  cellId: string;
+  /** Run identifier. */
+  runId: string;
+  /** Hash of the sweep + cell configuration. */
+  configHash: string;
+  /** The cell that was executed. */
+  cell: Cell;
+  /** Metrics of the run, when it produced any. */
+  metrics?: CellMetrics;
+  /** Error text, when the run failed. */
+  error?: string;
+  /** Start timestamp. */
+  startedAt: string;
+  /** Duration of the run in ms. */
+  durationMs: number;
+}
diff --git a/benchmarks/v2/smoke/provider-test.ts b/benchmarks/v2/smoke/provider-test.ts
new file mode 100644
index 0000000..4ce6388
--- /dev/null
+++ b/benchmarks/v2/smoke/provider-test.ts
@@ -0,0 +1,103 @@
+import { OpenAICompatProvider } from "../providers/openai-compat.ts";
+import type { ChatMessage, ToolDef } from "../providers/types.ts";
+
+// Endpoint and model are overridable through the environment; the defaults
+// target the Ollama OpenAI-compatible endpoint described in the README.
+const DGX_URL = process.env.SLOP_DGX_URL ?? "http://slopinator-s-1.local:11434/v1";
+const MODEL = process.env.SLOP_SMOKE_MODEL ?? "gemma4:31b";
+
+// Tools offered to the model: a lookup tool answered with canned data by
+// `dispatchTool`, and a terminal `answer` tool that ends the conversation.
+const tools: ToolDef[] = [
+  {
+    name: "get_weather",
+    description: "Get current weather for a city",
+    parameters: {
+      type: "object",
+      properties: { city: { type: "string", description: "City name" } },
+      required: ["city"],
+    },
+  },
+  {
+    name: "answer",
+    description: "Deliver the final answer to the user once you have enough information",
+    parameters: {
+      type: "object",
+      properties: { text: { type: "string" } },
+      required: ["text"],
+    },
+  },
+];
+
+/**
+ * Drives a short multi-turn tool-calling conversation against the configured
+ * model and prints per-turn usage plus a final summary.
+ *
+ * The conversation ends when the model calls `answer` (summary reports
+ * `answered: true`), replies without any tool call, or `MAX_TURNS` turns have
+ * been used (both report `answered: false`).
+ */
+async function main() {
+  const provider = new OpenAICompatProvider({ baseUrl: DGX_URL, model: MODEL });
+  console.log(`[smoke] provider=${provider.id} url=${DGX_URL}`);
+
+  const systemPrompt =
+    "You are an assistant that always uses tools when they can help. When you have a final answer, call the `answer` tool.";
+  const history: ChatMessage[] = [
+    { role: "user", content: "What's the weather in Tokyo? Report in one short sentence." },
+  ];
+
+  const MAX_TURNS = 6;
+  let turn = 0;
+  let totalInput = 0;
+  let totalOutput = 0;
+  const t0 = performance.now();
+
+  while (turn < MAX_TURNS) {
+    turn += 1;
+    const res = await provider.generate({ systemPrompt, messages: history, tools });
+    const { message, usage } = res;
+    totalInput += usage.inputTokens;
+    totalOutput += usage.outputTokens;
+
+    console.log(
+      `[turn ${turn}] finish=${res.finishReason} in=${usage.inputTokens} out=${usage.outputTokens} latency=${res.rawLatencyMs.toFixed(0)}ms`,
+    );
+
+    history.push(message);
+
+    const calls = message.toolCalls ?? [];
+    if (calls.length === 0) {
+      // A plain assistant reply ends the conversation without an `answer` call.
+      console.log(`[turn ${turn}] assistant: ${message.content.slice(0, 200)}`);
+      break;
+    }
+
+    for (const call of calls) {
+      console.log(`[turn ${turn}] tool_call ${call.name}(${JSON.stringify(call.arguments)})`);
+
+      if (call.name === "answer") {
+        console.log(`\nFINAL ANSWER: ${String(call.arguments.text ?? "")}`);
+        printSummary(totalInput, totalOutput, t0, turn, true);
+        return;
+      }
+
+      const toolResult = dispatchTool(call.name, call.arguments);
+      history.push({
+        role: "tool",
+        content: JSON.stringify(toolResult),
+        toolCallId: call.id,
+        name: call.name,
+      });
+    }
+  }
+
+  printSummary(totalInput, totalOutput, t0, turn, false);
+}
+
+/**
+ * Produces the canned result for a tool call made during the smoke test.
+ *
+ * `get_weather` echoes the requested city (or `"unknown"`) with fixed weather
+ * data; any other name yields an error object.
+ */
+function dispatchTool(name: string, args: Record<string, unknown>): unknown {
+  return name === "get_weather"
+    ? { city: args.city ?? "unknown", temp_c: 18, conditions: "partly cloudy" }
+    : { error: `unknown tool: ${name}` };
+}
+
+/**
+ * Prints the closing summary of the smoke run.
+ *
+ * @param inTok - Input tokens summed over all turns.
+ * @param outTok - Output tokens summed over all turns.
+ * @param t0 - `performance.now()` timestamp taken before the first turn.
+ * @param turns - Number of turns used.
+ * @param answered - Whether the model delivered its answer via the `answer` tool.
+ */
+function printSummary(inTok: number, outTok: number, t0: number, turns: number, answered: boolean) {
+  const elapsedMs = performance.now() - t0;
+  const report = [
+    "\n--- smoke summary ---",
+    `turns:       ${turns}`,
+    `input tok:   ${inTok}`,
+    `output tok:  ${outTok}`,
+    `total ms:    ${elapsedMs.toFixed(0)}`,
+    `answered:    ${answered}`,
+  ];
+  for (const line of report) console.log(line);
+}
+
+// Script entry point: run the smoke test; on any rejection log it and exit with status 1.
+main().catch((err) => {
+  console.error("[smoke] failed:", err);
+  process.exit(1);
+});
diff --git a/benchmarks/v2/tsconfig.json b/benchmarks/v2/tsconfig.json
new file mode 100644
index 0000000..1e4242a
--- /dev/null
+++ b/benchmarks/v2/tsconfig.json
@@ -0,0 +1,16 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "ESNext",
+    "moduleResolution": "bundler",
+    "strict": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "allowImportingTsExtensions": true,
+    "noEmit": true,
+    "resolveJsonModule": true,
+    "types": ["bun-types"],
+    "lib": ["ES2022"]
+  },
+  "include": ["**/*.ts"]
+}
diff --git a/benchmarks/v2/variants/encodings.ts b/benchmarks/v2/variants/encodings.ts
new file mode 100644
index 0000000..587666f
--- /dev/null
+++ b/benchmarks/v2/variants/encodings.ts
@@ -0,0 +1,157 @@
+import { formatTree } from "@slop-ai/consumer";
+import type { SlopNode } from "@slop-ai/consumer";
+
+/**
+ * Tree encoders translate a SlopNode into the text string embedded in the
+ * system prompt. The encoder is orthogonal to server-side optimization —
+ * an "optimized" tree is still a SlopNode and can be projected via any
+ * encoding.
+ *
+ * Phase C ships five encodings. To compare cost vs. legibility on the
+ * same scenario, run the ablation config which sweeps all of them.
+ */
+export type TreeEncoder = (node: SlopNode) => string;
+
+/** Indented text as produced by the consumer library's `formatTree`. */
+function indentedText(node: SlopNode): string {
+  return formatTree(node);
+}
+
+/** Single-line JSON of the stripped node. */
+function jsonCompact(node: SlopNode): string {
+  const plain = stripNode(node);
+  return JSON.stringify(plain);
+}
+
+/** Two-space indented JSON of the stripped node. */
+function jsonPretty(node: SlopNode): string {
+  const plain = stripNode(node);
+  return JSON.stringify(plain, null, 2);
+}
+
+/** YAML-style rendering of the stripped node. */
+function yaml(node: SlopNode): string {
+  const out: string[] = [];
+  emitYaml(stripNode(node) as Record<string, unknown>, 0, out);
+  return out.join("\n");
+}
+
+/** Markdown rendering with one heading per node. */
+function markdownHeadings(node: SlopNode): string {
+  const out: string[] = [];
+  emitMarkdown(node, 0, out, "");
+  return out.join("\n");
+}
+
+/** Registry of encoders, keyed by encoding variant id. */
+export const ENCODING_VARIANTS: Record<string, TreeEncoder> = {
+  "indented-text": indentedText,
+  "json-compact": jsonCompact,
+  "json-pretty": jsonPretty,
+  yaml,
+  "markdown-headings": markdownHeadings,
+};
+
+/**
+ * Looks up the tree encoder registered under `id`.
+ *
+ * Only the registry's own keys are honoured. A plain index lookup would also
+ * return inherited members such as `constructor` or `toString`, which are
+ * truthy but are not encoders.
+ *
+ * @throws Error listing the available ids when `id` is not registered.
+ */
+export function resolveEncoding(id: string): TreeEncoder {
+  const fn = Object.prototype.hasOwnProperty.call(ENCODING_VARIANTS, id) ? ENCODING_VARIANTS[id] : undefined;
+  if (!fn) throw new Error(`Unknown encoding variant: ${id}. Available: ${Object.keys(ENCODING_VARIANTS).join(", ")}`);
+  return fn;
+}
+
+/**
+ * Reduces a node tree to a plain, JSON-friendly object.
+ *
+ * `id` and `type` are always kept. `properties`, `meta`, `affordances` and
+ * `children` are kept only when non-empty; affordances are reduced to
+ * `action` plus `description`/`params` when those are set, and children are
+ * stripped recursively. Every other field of the node (for example
+ * `content_ref`) is dropped, since benchmarks don't use large content
+ * payloads.
+ */
+function stripNode(node: SlopNode): Record<string, unknown> {
+  const { id, type, properties, meta, affordances, children } = node;
+  const stripped: Record<string, unknown> = { id, type };
+
+  if (properties && Object.keys(properties).length > 0) stripped.properties = properties;
+  if (meta && Object.keys(meta).length > 0) stripped.meta = meta;
+
+  if (affordances?.length) {
+    stripped.affordances = affordances.map((affordance) => {
+      const shape: Record<string, unknown> = { action: affordance.action };
+      if (affordance.description) shape.description = affordance.description;
+      if (affordance.params) shape.params = affordance.params;
+      return shape;
+    });
+  }
+
+  if (children?.length) stripped.children = children.map((child) => stripNode(child));
+  return stripped;
+}
+
+/**
+ * Appends `text` to the last emitted line (the `key:` or `-` that introduced
+ * the value). When nothing has been emitted yet the text becomes the first
+ * line instead of being written to index -1.
+ */
+function appendInline(lines: string[], text: string): void {
+  if (lines.length === 0) {
+    lines.push(text);
+    return;
+  }
+  lines[lines.length - 1] = `${lines[lines.length - 1]} ${text}`;
+}
+
+/**
+ * Emits `value` as YAML into `lines`.
+ *
+ * Scalars, `null`/`undefined`, empty arrays and empty objects are written
+ * inline on the line that introduced them. Non-empty objects and arrays are
+ * written as block mappings/sequences whose entries start at `indent` levels
+ * of two spaces.
+ *
+ * @param value - Value to emit.
+ * @param indent - Indentation level (two spaces each) for block entries.
+ * @param lines - Output buffer, mutated in place.
+ */
+function emitYaml(value: unknown, indent: number, lines: string[]): void {
+  const pad = "  ".repeat(indent);
+  if (value === null || value === undefined) {
+    appendInline(lines, "null");
+    return;
+  }
+  if (typeof value !== "object") {
+    appendInline(lines, yamlScalar(value));
+    return;
+  }
+  if (Array.isArray(value)) {
+    if (value.length === 0) {
+      appendInline(lines, "[]");
+      return;
+    }
+    for (const item of value) {
+      if (item !== null && typeof item === "object" && !Array.isArray(item)) {
+        const record = item as Record<string, unknown>;
+        const keys = Object.keys(record);
+        if (keys.length === 0) {
+          lines.push(`${pad}- {}`);
+          continue;
+        }
+        keys.forEach((key, position) => {
+          // The first key shares its line with the "- " marker, so every key of
+          // the item sits one level deeper than the dash. Nested values of ALL
+          // keys (including the first) therefore go two levels deeper than the
+          // sequence; otherwise the first key's children would line up with its
+          // sibling keys.
+          lines.push(position === 0 ? `${pad}- ${key}:` : `${pad}  ${key}:`);
+          emitYaml(record[key], indent + 2, lines);
+        });
+      } else {
+        lines.push(`${pad}-`);
+        emitYaml(item, indent + 1, lines);
+      }
+    }
+    return;
+  }
+  const record = value as Record<string, unknown>;
+  const keys = Object.keys(record);
+  if (keys.length === 0) {
+    appendInline(lines, "{}");
+    return;
+  }
+  for (const key of keys) {
+    lines.push(`${pad}${key}:`);
+    emitYaml(record[key], indent + 1, lines);
+  }
+}
+
+/** Words a YAML reader may interpret as null/boolean when left unquoted. */
+const YAML_RESERVED_WORD = /^(?:null|true|false|yes|no|on|off)$/i;
+
+/**
+ * Formats a scalar for YAML output.
+ *
+ * Strings are emitted bare only when they consist of word characters, `.`,
+ * `-` and `/`, AND would not be read back as another type: reserved words (any
+ * casing) and numeric-looking strings such as `"42"` or `"1e3"` are quoted so
+ * they stay distinguishable from real nulls, booleans and numbers. All other
+ * strings are JSON-quoted. Non-strings are stringified with `String`.
+ */
+function yamlScalar(v: unknown): string {
+  if (typeof v === "string") {
+    const plainSafe = /^[\w.\-/]+$/.test(v) && !YAML_RESERVED_WORD.test(v) && Number.isNaN(Number(v));
+    return plainSafe ? v : JSON.stringify(v);
+  }
+  return String(v);
+}
+
+/**
+ * Emits `node` and its descendants as Markdown into `lines`.
+ *
+ * Each node becomes a heading (level `depth + 2`, capped at 6) showing its
+ * type, id and path, followed by the optional summary quote, a bullet list of
+ * properties, an `actions:` list of affordances and a `_meta: …_` line for
+ * `total_children`, `window` and `salience`. A blank line closes the node
+ * before its children are emitted one level deeper.
+ *
+ * @param node - Node to render.
+ * @param depth - Nesting depth of `node`; 0 for the root.
+ * @param lines - Output buffer, mutated in place.
+ * @param pathPrefix - Path of the parent node, `""` for the root.
+ */
+function emitMarkdown(node: SlopNode, depth: number, lines: string[], pathPrefix: string): void {
+  const heading = "#".repeat(Math.min(depth + 2, 6));
+  const path = pathPrefix === "" ? `/${node.id}` : `${pathPrefix}/${node.id}`;
+  lines.push(`${heading} \`${node.type}\` ${node.id}  \`${path}\``);
+  if (node.meta?.summary) lines.push(`> ${node.meta.summary}`);
+  if (node.properties && Object.keys(node.properties).length > 0) {
+    lines.push("");
+    for (const [k, v] of Object.entries(node.properties)) lines.push(`- **${k}**: ${formatProp(v)}`);
+  }
+  if (node.affordances && node.affordances.length > 0) {
+    lines.push("");
+    lines.push("actions:");
+    for (const a of node.affordances) {
+      // Only the parameter names are listed, taken from the schema's `properties`.
+      const params = a.params ? Object.keys((a.params as { properties?: Record<string, unknown> }).properties ?? {}).join(", ") : "";
+      lines.push(`- \`${a.action}(${params})\`${a.description ? ` — ${a.description}` : ""}`);
+    }
+  }
+  if (node.meta) {
+    // No truthiness pre-check here: `total_children: 0` is falsy but must
+    // still be reported, exactly like `salience: 0`.
+    const metaBits: string[] = [];
+    if (node.meta.total_children !== undefined) metaBits.push(`total_children=${node.meta.total_children}`);
+    if (node.meta.window) metaBits.push(`window=${node.meta.window.join(",")}`);
+    if (node.meta.salience !== undefined) metaBits.push(`salience=${node.meta.salience}`);
+    if (metaBits.length > 0) lines.push(`_meta: ${metaBits.join(", ")}_`);
+  }
+  lines.push("");
+  if (node.children) {
+    for (const child of node.children) emitMarkdown(child, depth + 1, lines, path);
+  }
+}
+
+/** Renders a property value: strings verbatim, everything else as JSON. */
+function formatProp(v: unknown): string {
+  if (typeof v === "string") return v;
+  return JSON.stringify(v);
+}
diff --git a/benchmarks/v2/variants/mcp-variants.ts b/benchmarks/v2/variants/mcp-variants.ts
new file mode 100644
index 0000000..3454534
--- /dev/null
+++ b/benchmarks/v2/variants/mcp-variants.ts
@@ -0,0 +1,39 @@
+/**
+ * MCP variant registry — the "fair MCP" dimension. The cell runner consults
+ * this before falling back to the app's default mcpSystemPrompt, so adding
+ * a variant is just "add an entry, re-run the sweep."
+ *
+ * Phase C ships `flat` (current baseline, domain prompt only) and
+ * `flat+prompt` (domain prompt + extra guidance teaching the model how to
+ * behave in a flat-tool world — parity with SLOP's spec prompt). The two
+ * remaining variants from the plan (`resources`, `prompts`) need new MCP
+ * server entry points and are deferred.
+ */
+export type McpPromptBuilder = (appSystemPrompt: string) => string; // maps the app's own system prompt to the prompt used for a variant
+// Baseline variant: the app's prompt is returned unchanged.
+const flat: McpPromptBuilder = (appPrompt) => appPrompt;
+// Extra guidance appended by `flatPlusPrompt`; the literal starts with two newlines so it sits below the app prompt.
+const FLAT_PLUS_PROMPT_GUIDANCE = `\n
+## How to use the tools
+
+The application exposes a flat list of tools. You do NOT get a tree of state upfront — you must discover state by calling list_* and get_* tools. Guidance:
+
+1. Start by calling the broadest list_* tool to understand what entities exist. Don't call get_* for individual items when you can list them.
+2. Once you know what's out there, filter in your head — don't call a tool unless you need the result.
+3. When you mutate state (mark_*, advance_*, set_*, delete_*), assume the change took effect unless the response says otherwise. Don't re-list to verify.
+4. If a tool returns an error like "missing required fields", re-read the tool's input schema and call again with the missing parameters.
+5. Tool call budgets matter — batch what you can in one turn rather than doing one-at-a-time round trips.
+`;
+// `flat+prompt` variant: app prompt followed verbatim by the guidance text above.
+const flatPlusPrompt: McpPromptBuilder = (appPrompt) => appPrompt + FLAT_PLUS_PROMPT_GUIDANCE;
+// Registry of variant id → builder; `resolveMcpVariant` looks ids up here and lists these keys in its error message.
+export const MCP_VARIANTS: Record<string, McpPromptBuilder> = {
+  flat,
+  "flat+prompt": flatPlusPrompt,
+};
+
+/** Return the prompt builder registered under `id`; throws an Error listing the available ids when there is none. */
+export function resolveMcpVariant(id: string): McpPromptBuilder {
+  for (const [key, fn] of Object.entries(MCP_VARIANTS)) if (key === id) return fn; // own keys only: inherited names like "toString" must not resolve
+  throw new Error(`Unknown mcp variant: ${id}. Available: ${Object.keys(MCP_VARIANTS).join(", ")}`);
+}
diff --git a/benchmarks/v2/variants/optimizations.ts b/benchmarks/v2/variants/optimizations.ts
new file mode 100644
index 0000000..fcdc1df
--- /dev/null
+++ b/benchmarks/v2/variants/optimizations.ts
@@ -0,0 +1,33 @@
+import type { SlopServerOpts } from "../../mcp-vs-slop/app/slop-server.ts";
+
+/**
+ * An "optimization" is how the SLOP server chooses to shape the tree it emits
+ * before the encoder sees it. Today v1 collapses this into a single
+ * `optimized: boolean` server option; v2 keeps the dimension open for when
+ * server-side salience / lazy / windowing become independently toggleable.
+ */
+
+export type OptimizationVariant = {
+  id: string; // same string as the entry's key in OPTIMIZATION_VARIANTS
+  description: string; // human-readable summary of what the variant does
+  serverOpts?: SlopServerOpts; // v1 SLOP server options; presumably handed to the server by the sweep runner — confirm against caller
+};
+// Registry of optimization id → variant; `resolveOptimization` looks ids up here and lists these keys in its error message.
+export const OPTIMIZATION_VARIANTS: Record<string, OptimizationVariant> = {
+  off: {
+    id: "off",
+    description: "No server-side optimization — full tree, every node, every child.",
+    serverOpts: undefined,
+  },
+  combined: {
+    id: "combined",
+    description: "v1 'optimized' mode — salience scoring + lazy comments + summaries.",
+    serverOpts: { optimized: true },
+  },
+};
+
+/** Return the optimization variant registered under `id`; throws an Error listing the available ids when there is none. */
+export function resolveOptimization(id: string): OptimizationVariant {
+  for (const [key, variant] of Object.entries(OPTIMIZATION_VARIANTS)) if (key === id) return variant; // own keys only: inherited names like "constructor" must not resolve
+  throw new Error(`Unknown optimization variant: ${id}. Available: ${Object.keys(OPTIMIZATION_VARIANTS).join(", ")}`);
+}
diff --git a/benchmarks/v2/variants/prompts.ts b/benchmarks/v2/variants/prompts.ts
new file mode 100644
index 0000000..bb19ac6
--- /dev/null
+++ b/benchmarks/v2/variants/prompts.ts
@@ -0,0 +1,78 @@
+import { SLOP_SYSTEM_PROMPT } from "../../mcp-vs-slop/harness/slop-system-prompt.ts";
+
+/**
+ * SLOP prompt library for the Phase C ablation. Each entry takes the
+ * app-specific state text (already rendered by the chosen encoder) and
+ * returns the full system prompt. The registry is extended by adding
+ * another entry — the cartesian sweep picks it up automatically.
+ */
+export type PromptBuilder = (stateContext: string) => string;
+
+// No framing at all: the rendered state is the entire prompt.
+const empty: PromptBuilder = (state) => state;
+
+// One-sentence instruction, then the state under a "Current state" heading.
+const minimal: PromptBuilder = (state) =>
+  `You are an agent. Use the available tools to complete the user's task. Respond with "DONE" when finished.\n\n## Current state\n\n${state}`;
+
+// v1's "basic" prompt kept for regression continuity with the old harness.
+const basic: PromptBuilder = (state) =>
+  `You are an agent. Here is the current state of the application:\n\n${state}\n\nUse the available tools to complete the task. When done, respond with "DONE".`;
+// Shared SLOP system prompt, immediately followed by the state and the closing instruction.
+const spec: PromptBuilder = (state) =>
+  `${SLOP_SYSTEM_PROMPT}${state}\n\nComplete the task using the available tools. When done, respond with "DONE".`;
+
+// Half-length spec prompt — compressed to the essentials. Tests how much
+// of the full framing is actually doing work vs. restating what's obvious.
+const SPEC_TERSE_HEADER = `You are an agent interacting with an application via the SLOP protocol.
+
+The application exposes its state as a tree of nodes. Each node has:
+- properties (data)
+- affordances (actions currently available on this node — do not attempt actions that aren't listed)
+- meta (optional hints like salience, summary, total_children)
+
+Tools:
+- Node actions are named \`nodeId__action\` and perform the affordance.
+- \`slop_query(path)\` — load a subtree (use for lazy nodes, stubs, or windowed collections).
+- \`slop_get_state\` — read the full tree.
+
+Affordances are contextual — they may change after you act. A hidden action is an action you cannot perform right now.
+
+## Current state
+
+`;
+// Terse header + rendered state + the closing "DONE" instruction; the header already ends with the "Current state" heading.
+const specTerse: PromptBuilder = (stateContext) =>
+  `${SPEC_TERSE_HEADER}${stateContext}\n\nComplete the task using the available tools. When done, respond with "DONE".`;
+
+// Role-play framing — same information, but packaged as a persona. Tests
+// whether the model responds to instruction-following framing over raw
+// specification language.
+const ROLE_PLAY_HEADER = `You are a careful operations engineer working inside an application. The application shows you its current state as a tree, and the tree tells you which actions are available on which parts of the state.
+
+Your rules:
+1. Never attempt an action that isn't explicitly listed as an affordance on the node you want to act on.
+2. If you can't see the thing you need, call \`slop_query\` on the path you expect, or \`slop_get_state\` to re-read the tree.
+3. After you act, check whether the tree changed and whether the affordances you need still exist.
+
+## Current state
+
+`;
+// Persona header + rendered state + the closing "DONE" instruction; the header already ends with the "Current state" heading.
+const rolePlay: PromptBuilder = (stateContext) =>
+  `${ROLE_PLAY_HEADER}${stateContext}\n\nComplete the task using the available tools. When done, respond with "DONE".`;
+
+export const PROMPT_VARIANTS: Record<string, PromptBuilder> = { // variant id → builder; resolvePrompt looks ids up here
+  empty, // state text only
+  minimal, // one-sentence instruction + state
+  basic, // v1 regression prompt
+  spec, // SLOP_SYSTEM_PROMPT framing
+  "spec-terse": specTerse, // condensed spec header
+  "role-play": rolePlay, // persona framing
+};
+
+/** Return the prompt builder registered under `id`; throws an Error listing the available ids when there is none. */
+export function resolvePrompt(id: string): PromptBuilder {
+  for (const [key, fn] of Object.entries(PROMPT_VARIANTS)) if (key === id) return fn; // own keys only: inherited names like "toString" must not resolve
+  throw new Error(`Unknown prompt variant: ${id}. Available: ${Object.keys(PROMPT_VARIANTS).join(", ")}`);
+}
diff --git a/bun.lock b/bun.lock
index 029f302..5b4d517 100644
--- a/bun.lock
+++ b/bun.lock
@@ -46,6 +46,18 @@
         "bun-types": "^1.3.11",
       },
     },
+    "benchmarks/v2": {
+      "name": "slop-benchmarks-v2",
+      "dependencies": {
+        "@modelcontextprotocol/sdk": "^1.29.0",
+        "@slop-ai/consumer": "workspace:*",
+        "@slop-ai/core": "workspace:*",
+        "@slop-ai/server": "workspace:*",
+      },
+      "devDependencies": {
+        "bun-types": "^1.3.11",
+      },
+    },
     "examples/cli/bun": {
       "name": "tsk",
       "version": "0.1.0",
@@ -2545,6 +2557,8 @@
 
     "slice-ansi": ["slice-ansi@5.0.0", "", { "dependencies": { "ansi-styles": "^6.0.0", "is-fullwidth-code-point": "^4.0.0" } }, "sha512-FC+lgizVPfie0kkhqUScwRu1O/lF6NOgJmlCgK+/LYxDCTk8sGelYaHDhFcDN+Sn3Cv+3VSa4Byeo+IMCzpMgQ=="],
 
+    "slop-benchmarks-v2": ["slop-benchmarks-v2@workspace:benchmarks/v2"],
+
     "slop-bridge-mcp-proxy": ["slop-bridge-mcp-proxy@workspace:packages/typescript/integrations/claude/slop-mcp-proxy/servers"],
 
     "slop-bridge-native": ["slop-bridge-native@workspace:packages/typescript/integrations/claude/slop-native/servers"],
diff --git a/package.json b/package.json
index 6f5aeff..849101c 100644
--- a/package.json
+++ b/package.json
@@ -19,7 +19,8 @@
     "website/playground",
     "apps/extension",
     "apps/desktop",
-    "benchmarks/mcp-vs-slop"
+    "benchmarks/mcp-vs-slop",
+    "benchmarks/v2"
   ],
   "scripts": {
     "build": "bun run scripts/build-typescript-packages.ts",