Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions benchmarks/v2/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# SLOP Benchmarks v2 — Experiment rig (WIP)

Successor to [`benchmarks/mcp-vs-slop`](../mcp-vs-slop/). v1 stays in place as a regression anchor; v2 turns it into a proper experiment framework so we can drive SLOP v0.2 spec decisions from data.

Design spec: [fluffy-napping-walrus.md](../../.claude/plans/fluffy-napping-walrus.md) (local plan file).

## Status

- [x] Phase A — DGX inference path (OpenAI-compat provider + smoke test)
- [ ] Phase B — Sweep runner + config matrix
- [ ] Phase C — Prompt / encoding / optimization variants
- [ ] Phase C' — Fair-MCP variants
- [ ] Phase D — Metrics + statistical post-processing
- [ ] Phase E — Static dashboard
- [ ] Phase F — App complexity ladder (todo, file-browser, crm)

## DGX Spark setup

Models are served via Ollama on `slopinator-s-1.local`. The systemd unit has an override that binds Ollama to all interfaces on both address families:

```ini
# /etc/systemd/system/ollama.service.d/override.conf
[Service]
Environment=OLLAMA_HOST=[::]:11434
```

Binding to `[::]` makes Ollama listen on both IPv4 and IPv6. This is required because Bun's `fetch` resolves `.local` names to IPv6 first and doesn't fall back to IPv4. If the override is ever lost, Bun will report `ConnectionRefused` while `curl` still works; that's the tell.

## Smoke test

```bash
cd benchmarks/v2
bun run smoke/provider-test.ts
SLOP_SMOKE_MODEL=nemotron-3-super:120b bun run smoke/provider-test.ts
```

Runs a multi-turn tool-calling conversation (weather lookup → answer) against the configured model. Prints per-turn token counts, latency, and whether the model successfully made the final-answer tool call. Fails loudly if the OpenAI-compat endpoint misbehaves.

## Environment variables

| Var | Default | Notes |
|---|---|---|
| `SLOP_DGX_URL` | `http://slopinator-s-1.local:11434/v1` | Override to point at a different host |
| `SLOP_SMOKE_MODEL` | `gemma4:31b` | Any model in `ollama list` |

## Layout (target)

```
v2/
├── providers/ # LlmProvider interface + adapters
│ ├── types.ts
│ └── openai-compat.ts # Ollama, vLLM, OpenAI, anything /v1-compatible
├── variants/ # prompts/, encodings/, optimizations/ (Phase C)
├── mcp-variants/ # fair-MCP pass (Phase C')
├── apps/ # todo / file-browser / issue-tracker / crm (Phase F)
├── scenarios/ # shared scenario types
├── metrics/ # collectors + stats (Phase D)
├── runner/ # sweep orchestrator (Phase B)
├── dashboard/ # static HTML report (Phase E)
└── smoke/ # validation scripts
```
88 changes: 88 additions & 0 deletions benchmarks/v2/apps/crm/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import { fileURLToPath } from "node:url";
import { Client } from "@modelcontextprotocol/sdk/client";
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";
import { CrmStore } from "./store.ts";
import { seedCrm } from "./seed.ts";
import { startCrmSlopServer, type CrmSlopOpts } from "./slop-server.ts";
import { crmScenarios } from "./scenarios.ts";
import type { AppBinding, AppStore, McpServerHandle, SlopServerHandle } from "../registry.ts";
import type { DataScale } from "../../runner/types.ts";
import type { Scenario, VerificationResult } from "../../../mcp-vs-slop/scenarios/types.ts";

/**
 * Packages a concrete CrmStore as the branded AppStore handle, keeping the
 * original store reachable under `inner`.
 *
 * @param inner The CRM store to expose through the handle.
 * @returns An object tagged `__brand: "app-store"` that also carries `inner`.
 */
function wrap(inner: CrmStore): AppStore & { inner: CrmStore } {
  type WrappedCrmStore = AppStore & { inner: CrmStore };
  return { __brand: "app-store", inner } as WrappedCrmStore;
}

/** Store shape that a scenario's `verify` callback accepts. */
type ScenarioVerifyStore = Parameters<NonNullable<Scenario["verify"]>>[0];

/**
 * Recovers the concrete CrmStore from a store handle.
 * Expects the handle to carry the store under `inner`, which is how
 * `createStore` below builds it (via `wrap`).
 */
function unwrapCrmStore(store: unknown): CrmStore {
  return (store as { inner: CrmStore }).inner;
}

/**
 * Benchmark binding for the CRM app: store creation, the SLOP server, the
 * stdio MCP server, the scenario list and scenario verification.
 */
export const crmApp: AppBinding = {
  id: "crm",
  supportedScales: ["s", "m", "l", "xl"],

  /** Builds a CrmStore populated by `seedCrm(scale, seed)` and wraps it as an AppStore. */
  createStore(scale, seed) {
    const store = new CrmStore();
    const { contacts, deals, activities } = seedCrm(scale, seed);
    store.reset(contacts, deals, activities);
    return wrap(store);
  },

  /**
   * Starts the SLOP server over the given store.
   *
   * @returns The websocket URL on `port` plus a `stop` callback that stops
   *   both objects returned by `startCrmSlopServer`.
   */
  async startSlopServer(store, port, opts): Promise<SlopServerHandle> {
    const { server, slop } = startCrmSlopServer(
      unwrapCrmStore(store),
      port,
      opts as CrmSlopOpts | undefined,
    );
    return {
      wsUrl: `ws://localhost:${port}/slop`,
      stop: async () => {
        slop.stop();
        server.stop();
      },
    };
  },

  scenarios: crmScenarios,

  /** Runs the scenario's own verifier against the in-process store, if it has one. */
  verify(store, scenario) {
    if (!scenario.verify) return undefined;
    return scenario.verify(unwrapCrmStore(store) as unknown as ScenarioVerifyStore);
  },

  mcpSystemPrompt:
    "You are a CRM agent. You have tools to list and mutate contacts, deals, and activities. " +
    "You have no prior knowledge of the data — discover it using list_* and get_* tools. " +
    'When the task is complete, respond with "DONE".',

  /**
   * Spawns `mcp-server.ts` under `bun run` and connects an MCP client to it over stdio.
   *
   * @param scale Data scale, forwarded to the child as `BENCH_SCALE`.
   * @param _variant Unused here; see the note in the body.
   * @returns The connected client, a `stop` callback that closes it, and a
   *   `verify` callback that checks a scenario against the server's current data.
   */
  async startMcpServer(scale: DataScale, _variant: string): Promise<McpServerHandle> {
    // All current MCP variants share the flat server; prompt-level variants
    // are applied by the cell runner via resolveMcpVariant.

    // Copy the parent environment, dropping unset entries so the map is
    // genuinely string-valued rather than asserted to be.
    const env: Record<string, string> = {};
    for (const [key, value] of Object.entries(process.env)) {
      if (value !== undefined) env[key] = value;
    }
    env.BENCH_SCALE = scale;
    // This method receives no seed argument, so the child is always seeded with 42.
    env.BENCH_SEED = String(42);

    // fileURLToPath decodes percent-escapes and yields a platform-correct path;
    // URL#pathname would break on checkout paths containing spaces or non-ASCII characters.
    const serverPath = fileURLToPath(new URL("./mcp-server.ts", import.meta.url));
    const transport = new StdioClientTransport({
      command: "bun",
      args: ["run", serverPath],
      env,
    });
    const client = new Client({ name: "slop-benchmarks-v2", version: "0.2.0" });
    await client.connect(transport);

    return {
      client,
      stop: async () => {
        await client.close();
      },
      verify: async (scenario: Scenario): Promise<VerificationResult | undefined> => {
        if (!scenario.verify) return undefined;
        // The server's store lives in the child process, so rebuild a local
        // copy by listing all three entity collections.
        const tempStore = new CrmStore();
        const [cRes, dRes, aRes] = await Promise.all([
          client.callTool({ name: "list_contacts", arguments: {} }),
          client.callTool({ name: "list_deals", arguments: {} }),
          client.callTool({ name: "list_activities", arguments: {} }),
        ]);
        tempStore.reset(parseJson(cRes), parseJson(dRes), parseJson(aRes));
        return scenario.verify(tempStore as unknown as ScenarioVerifyStore);
      },
    };
  },
};

/**
 * Extracts the JSON array carried by the first text item of a tool-call result.
 *
 * This is best-effort: it returns an empty array when the result has no
 * usable `content`, has no text item, the text is not valid JSON, or the JSON
 * is not an array (for example an `{ "error": ... }` object).
 *
 * @param result Raw tool-call result; only its `content` array is inspected.
 * @returns The parsed array, or `[]` if none could be extracted.
 */
function parseJson(result: unknown): any[] {
  const rawContent = (result as { content?: unknown } | null | undefined)?.content;
  const content = Array.isArray(rawContent)
    ? (rawContent as Array<{ type: string; text?: string }>)
    : [];
  const text = content.find((c) => c?.type === "text")?.text ?? "[]";
  try {
    const parsed: unknown = JSON.parse(text);
    // Callers pass the value on as a collection, so only arrays are let through.
    return Array.isArray(parsed) ? parsed : [];
  } catch {
    return [];
  }
}
120 changes: 120 additions & 0 deletions benchmarks/v2/apps/crm/mcp-server.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/**
* Stdio MCP server for the crm benchmark app. Spawned as a child process by
* the MCP cell runner. Env vars:
* - BENCH_SCALE = s | m | l | xl
* - BENCH_SEED = integer
*/

import { Server } from "@modelcontextprotocol/sdk/server";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";
import { CrmStore, type ActivityType, type DealStage } from "./store.ts";
import { seedCrm } from "./seed.ts";
import type { DataScale } from "../../runner/types.ts";

// Seeding inputs come from the environment: scale falls back to "s" and the
// seed to 42 when the variables are unset.
const { BENCH_SCALE, BENCH_SEED } = process.env;
const scale = (BENCH_SCALE as DataScale | undefined) ?? "s";
const seed = Number(BENCH_SEED ?? 42);

// Populate the store once at startup; the tool handlers below read and mutate it.
const store = new CrmStore();
const seeded = seedCrm(scale, seed);
store.reset(seeded.contacts, seeded.deals, seeded.activities);

// The server advertises the tools capability only.
const serverInfo = { name: "crm-mcp", version: "0.2.0" };
const serverOptions = { capabilities: { tools: {} } };
const server = new Server(serverInfo, serverOptions);

/** JSON-schema fragment describing one tool argument. */
type ToolArgSchema = { type: string; description?: string };

/**
 * Builds one tool definition with an object-typed input schema.
 * `required` is emitted only when at least one argument is mandatory.
 */
function defineTool(
  name: string,
  description: string,
  properties: Record<string, ToolArgSchema> = {},
  required?: string[],
) {
  return {
    name,
    description,
    inputSchema: {
      type: "object" as const,
      properties,
      ...(required ? { required } : {}),
    },
  };
}

const stringArg: ToolArgSchema = { type: "string" };
const stageArg: ToolArgSchema = { type: "string", description: "lead|qualified|proposal|won|lost" };
const activityTypeArg: ToolArgSchema = { type: "string", description: "call|email|meeting|note" };

/** Every tool this server exposes: listing, lookup, relation queries and mutations. */
const crmTools = [
  // Listing
  defineTool("list_contacts", "List every contact"),
  defineTool("list_deals", "List every deal. Optional filter by stage.", { stage: stageArg }),
  // The description names the filters exactly as the schema does (snake_case).
  defineTool("list_activities", "List every activity. Optional filter by deal_id or contact_id.", {
    deal_id: stringArg,
    contact_id: stringArg,
  }),

  // Lookup by id
  defineTool("get_contact", "Get a contact by id", { id: stringArg }, ["id"]),
  defineTool("get_deal", "Get a deal by id", { id: stringArg }, ["id"]),
  defineTool("get_activity", "Get an activity by id", { id: stringArg }, ["id"]),

  // Relation queries
  defineTool("deals_for_contact", "Return every deal belonging to a contact", { contact_id: stringArg }, ["contact_id"]),
  defineTool("activities_for_deal", "Return every activity attached to a deal", { deal_id: stringArg }, ["deal_id"]),
  defineTool("activities_for_contact", "Return every activity attached to a contact", { contact_id: stringArg }, ["contact_id"]),

  // Mutations
  defineTool("advance_deal_stage", "Set a deal's stage", { id: stringArg, stage: stageArg }, ["id", "stage"]),
  defineTool("set_deal_value", "Set a deal's USD value", { id: stringArg, value: { type: "number" } }, ["id", "value"]),
  defineTool(
    "add_activity",
    "Create a new activity on a deal or contact. Provide deal_id XOR contact_id.",
    {
      deal_id: stringArg,
      contact_id: stringArg,
      type: activityTypeArg,
      subject: stringArg,
      body: stringArg,
    },
    ["type", "subject", "body"],
  ),
  defineTool("delete_contact", "Delete a contact", { id: stringArg }, ["id"]),
  defineTool("delete_deal", "Delete a deal", { id: stringArg }, ["id"]),
  defineTool("delete_activity", "Delete an activity", { id: stringArg }, ["id"]),
];

// tools/list: the catalogue is static, so every request gets the same list.
server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: crmTools }));

/** Deal stages accepted by advance_deal_stage. */
const DEAL_STAGES: readonly string[] = ["lead", "qualified", "proposal", "won", "lost"];

/** Activity types accepted by add_activity. */
const ACTIVITY_TYPES: readonly string[] = ["call", "email", "meeting", "note"];

/**
 * Arguments each tool cannot run without. Checked before dispatch so a
 * missing id is reported as an error instead of being stringified to
 * "undefined". A Map is used so a client-supplied tool name can never hit an
 * inherited object property.
 */
const REQUIRED_TOOL_ARGS = new Map<string, readonly string[]>([
  ["get_contact", ["id"]],
  ["get_deal", ["id"]],
  ["get_activity", ["id"]],
  ["deals_for_contact", ["contact_id"]],
  ["activities_for_deal", ["deal_id"]],
  ["activities_for_contact", ["contact_id"]],
  ["advance_deal_stage", ["id", "stage"]],
  ["set_deal_value", ["id", "value"]],
  ["add_activity", ["type", "subject", "body"]],
  ["delete_contact", ["id"]],
  ["delete_deal", ["id"]],
  ["delete_activity", ["id"]],
]);

/**
 * tools/call: runs the named tool against the module-level store.
 *
 * Failures are never thrown to the transport. Unknown tools, missing or
 * invalid arguments, unknown ids on get_* and any exception raised by the
 * store all come back as an `isError` result whose text is `{"error": ...}`.
 */
server.setRequestHandler(CallToolRequestSchema, async (req) => {
  const { name, arguments: args } = req.params;
  const a = (args ?? {}) as Record<string, unknown>;
  try {
    const missing = (REQUIRED_TOOL_ARGS.get(name) ?? []).filter((k) => a[k] == null);
    if (missing.length > 0) return err(`missing required fields: ${missing.join(", ")}`);

    switch (name) {
      case "list_contacts":
        return json(store.contacts);
      case "list_deals": {
        const stage = a.stage ? String(a.stage) : undefined;
        const filtered = stage ? store.deals.filter((d) => d.stage === stage) : store.deals;
        return json(filtered);
      }
      case "list_activities": {
        // Both filters are optional and combine with AND when both are given.
        let out = store.activities;
        if (a.deal_id) out = out.filter((x) => x.dealId === a.deal_id);
        if (a.contact_id) out = out.filter((x) => x.contactId === a.contact_id);
        return json(out);
      }
      case "get_contact": {
        const contact = store.getContact(String(a.id));
        return contact ? json(contact) : err(`contact ${a.id} not found`);
      }
      case "get_deal": {
        const deal = store.getDeal(String(a.id));
        return deal ? json(deal) : err(`deal ${a.id} not found`);
      }
      case "get_activity": {
        const activity = store.getActivity(String(a.id));
        return activity ? json(activity) : err(`activity ${a.id} not found`);
      }
      case "deals_for_contact":
        return json(store.dealsForContact(String(a.contact_id)));
      case "activities_for_deal":
        return json(store.activitiesForDeal(String(a.deal_id)));
      case "activities_for_contact":
        return json(store.activitiesForContact(String(a.contact_id)));
      case "advance_deal_stage": {
        const stage = String(a.stage);
        if (!DEAL_STAGES.includes(stage)) return err(`invalid stage ${stage}`);
        store.advanceStage(String(a.id), stage as DealStage);
        return json({ id: a.id, stage });
      }
      case "set_deal_value": {
        const value = Number(a.value);
        // Reject anything that does not coerce to a real number, so NaN or
        // Infinity is never written to the store.
        if (!Number.isFinite(value)) return err(`invalid value: ${String(a.value)}`);
        store.setDealValue(String(a.id), value);
        return json({ id: a.id, value });
      }
      case "add_activity": {
        const type = String(a.type);
        if (!ACTIVITY_TYPES.includes(type)) return err(`invalid type: ${type} (expected call|email|meeting|note)`);
        // An activity hangs off exactly one parent: a deal or a contact.
        const dealId = a.deal_id ? String(a.deal_id) : null;
        const contactId = a.contact_id ? String(a.contact_id) : null;
        if (dealId && contactId) return err("provide deal_id OR contact_id, not both");
        if (!dealId && !contactId) return err("provide deal_id OR contact_id");
        const activity = store.addActivity({
          dealId,
          contactId,
          type: type as ActivityType,
          subject: String(a.subject),
          body: String(a.body),
        });
        return json(activity);
      }
      case "delete_contact":
        store.deleteContact(String(a.id));
        return json({ deleted: a.id });
      case "delete_deal":
        store.deleteDeal(String(a.id));
        return json({ deleted: a.id });
      case "delete_activity":
        store.deleteActivity(String(a.id));
        return json({ deleted: a.id });
      default:
        return err(`unknown tool ${name}`);
    }
  } catch (e) {
    return err(e instanceof Error ? e.message : String(e));
  }
});

/**
 * Wraps a value as a tool result holding a single text item.
 *
 * @param data Value to serialise with `JSON.stringify`.
 * @returns `{ content: [{ type: "text", text }] }` where `text` is the serialised value.
 */
function json(data: unknown) {
  const text = JSON.stringify(data);
  const item = { type: "text", text };
  return { content: [item] };
}
/**
 * Builds a failed tool result.
 *
 * @param msg Human-readable failure reason.
 * @returns A result flagged `isError: true` whose single text item is the
 *   JSON encoding of `{ error: msg }`.
 */
function err(msg: string) {
  const text = JSON.stringify({ error: msg });
  const item = { type: "text", text };
  return { content: [item], isError: true };
}

// Serve requests over this process's stdio; the module awaits the connection at top level.
await server.connect(new StdioServerTransport());
Loading
Loading