From dd79ce61ab506e5588a59eaad17cbe022aa8a916 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Mon, 18 May 2026 10:58:49 +1200 Subject: [PATCH 1/5] feat(executor): first-class MCP server support via executorMcpServers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a separate, parallel mechanism to executorPlugins for installing MCP servers into the executor sandbox. A Claude plugin's bundled mcpServers are not loaded by `claude --plugin-dir`, so MCP servers need their own config surface. - types: ExecutorMcpServer union (local/git/sourceless) + ResolvedExecutorMcpServer - config: validateExecutorMcpServerEntry — slug names, string command/args, optional local|git type, rejects ${MCP_ROOT} when sourceless - scaffolding: resolveExecutorMcpServers mirrors resolveExecutorPlugins - adapter/base: installMcpServersInSandbox contract; default throws - claude: uploads sourced servers to $HOME/.mcp-servers//, substitutes ${MCP_ROOT}, writes mcp-config.json, emits --mcp-config (no --strict) - codex: uploads to $CODEX_HOME/.mcp-servers//, appends [mcp_servers.*] TOML blocks to config.toml - gemini/custom: throw a clear unsupported error - execute: install block parallel to plugins, writes mcp-server-install-error.log - tests + config-schema.md docs Co-Authored-By: Claude Sonnet 4.6 --- skills/_reference/config-schema.md | 59 +++++++++++++++ src/agents/__tests__/claude.test.ts | 41 ++++++++++ src/agents/__tests__/codex.test.ts | 36 +++++++++ src/agents/adapter.ts | 12 ++- src/agents/base.ts | 19 ++++- src/agents/claude.ts | 64 +++++++++++++++- src/agents/codex.ts | 68 ++++++++++++++++- src/agents/gemini.ts | 19 ++++- src/commands/execute.ts | 15 +++- src/core/__tests__/config.test.ts | 112 ++++++++++++++++++++++++++++ src/core/config.ts | 74 ++++++++++++++++++ src/sandbox/scaffolding.ts | 43 ++++++++++- src/scoring/__tests__/judge.test.ts | 1 + src/types.ts | 73 ++++++++++++++++++ 14 files changed, 628 insertions(+), 8 deletions(-) diff --git a/skills/_reference/config-schema.md b/skills/_reference/config-schema.md index 38735a0..5e218dc 100644 --- a/skills/_reference/config-schema.md +++ b/skills/_reference/config-schema.md @@ -10,6 +10,7 @@ | `targets` | `TargetConfig[]` | **Yes** | Non-empty array. Docker images for sandboxed execution. | | `workspace` | `WorkspaceConfig` | No | Workspace template and setup. | | `executorPlugins` | `ExecutorPlugin[]` | No | Plugin directories installed into the executor's agent CLI inside the sandbox (Claude marketplace, Codex skills, Gemini extensions). Not installed in the judge sandbox — that's intentional, so the judge stays independent of the executor's tooling. | +| `executorMcpServers` | `ExecutorMcpServer[]` | No | MCP servers wired into the executor's agent CLI inside the sandbox (Claude `--mcp-config`, Codex `config.toml`). A separate mechanism from `executorPlugins` — a plugin's bundled MCP servers are not loaded. Not installed in the judge sandbox. | | `sandbox` | `SandboxConfig` | **Yes** | Must be an object (can be `{}`). Resource limits, secrets, env vars. | ## SourceConfig (discriminated union on `type`) @@ -186,6 +187,63 @@ What an adapter requires inside the plugin directory: Each adapter fails fast at install time if its required file is missing — the A/B comparison won't silently no-op. +## ExecutorMcpServer (discriminated union on optional `type`) + +An MCP server wired into the executor's agent CLI. Independent of +`ExecutorPlugin` — MCP servers are wired through each CLI's native MCP-config +surface, NOT through agent plugins (a Claude plugin's `mcpServers` are not +loaded by `claude --plugin-dir`). Installed **only** in the executor sandbox; +the judge sandbox is kept MCP-free. + +Every entry shares three fields: + +| Field | Type | Required | +|-------|------|----------| +| `name` | `string` | Yes — server slug (letters/digits/`.`/`_`/`-`), unique across the array. Used as the MCP server name and install dir name. | +| `command` | `string` | Yes — executable that launches the MCP server (e.g. `node`, `npx`). | +| `args` | `string[]` | Yes — args to `command`. May contain the literal `${MCP_ROOT}` placeholder when a source is present. | + +The optional `type` discriminator selects the source shape: + +### CommandExecutorMcpServer (no `type`) + +No source — `command`/`args` are used as-is (e.g. a server launched via `npx`). +`${MCP_ROOT}` must NOT appear in `args` (there is nothing to substitute) — the +config validator rejects it. + +### LocalExecutorMcpServer (`type: "local"`) + +| Field | Type | Required | +|-------|------|----------| +| `type` | `"local"` | Yes | +| `path` | `string` | Yes — host directory tree uploaded into the sandbox | + +### GitExecutorMcpServer (`type: "git"`) + +| Field | Type | Required | +|-------|------|----------| +| `type` | `"git"` | Yes | +| `url` | `string` | Yes — git repository URL | +| `branch` | `string` | No | +| `subpath` | `string` | No — path within the repo to the server source | +| `sparse` | `string[]` | No — sparse checkout paths | + +### `${MCP_ROOT}` substitution + +For sourced servers (`local`/`git`), the source directory is uploaded into the +sandbox and the literal string `${MCP_ROOT}` in each `args` entry is replaced +with that server's absolute sandbox install dir at install time. Sourceless +servers cannot use the placeholder. + +### Per-adapter wiring + +| Adapter | Sandbox destination | Wiring | +|---|---|---| +| `claude` | sourced servers extracted to `$HOME/.mcp-servers//` (outside `/workspace`) | combined `$HOME/.mcp-servers/mcp-config.json` passed via the `--mcp-config` flag (no `--strict-mcp-config`) | +| `codex` | sourced servers extracted to `$CODEX_HOME/.mcp-servers//` | `[mcp_servers.]` block appended to `$CODEX_HOME/config.toml` (auto-read; existing content preserved) | +| `gemini` | — | Not supported in non-interactive mode. Adapter throws a clear error if `executorMcpServers` is non-empty. | +| custom | — | Not supported. Adapter throws a clear error if `executorMcpServers` is non-empty. | + ## Validation Rules 1. Root must be a JSON object @@ -198,6 +256,7 @@ A/B comparison won't silently no-op. 8. Custom agents must provide `envVar` and `baseUrl` in their secret 9. `baseUrl` must be a parseable URL 10. `executorPlugins`, if present, must be an array; each entry needs a `name` (slug-safe) and a valid `type` (`local` or `git`); names must be unique +11. `executorMcpServers`, if present, must be an array; each entry needs a slug-safe unique `name`, a string `command`, and a `string[]` `args`; `type` is optional but when present must be `local` (needs `path`) or `git` (needs `url`); when `type` is absent, `${MCP_ROOT}` must not appear in `args` ## Minimal Examples diff --git a/src/agents/__tests__/claude.test.ts b/src/agents/__tests__/claude.test.ts index 3d51a8a..e8eba5a 100644 --- a/src/agents/__tests__/claude.test.ts +++ b/src/agents/__tests__/claude.test.ts @@ -187,4 +187,45 @@ describe('ClaudeAdapter', () => { expect(cmd).not.toContain('--plugin-dir'); }); }); + + describe('installMcpServersInSandbox', () => { + it('is a no-op when given an empty server list', async () => { + const client = makeMockSandboxClient(); + await adapter.installMcpServersInSandbox(client as any, []); + expect(client.runCommand).not.toHaveBeenCalled(); + }); + + it('uploads sourced servers, substitutes ${MCP_ROOT}, and writes mcp-config.json', async () => { + const client = makeMockSandboxClient(); + client.runCommand.mockResolvedValue({ stdout: '/root', stderr: '', exitCode: 0 }); + + await adapter.installMcpServersInSandbox(client as any, [ + { name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js', '--flag'], hostDir: '/tmp/mine' }, + { name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); + + // Sourced server is uploaded outside /workspace. + expect(mockUploadDir).toHaveBeenCalledWith(client, '/tmp/mine', '/root/.mcp-servers/mine', 'mcp_mine'); + // Sourceless server is not uploaded. + expect(mockUploadDir).toHaveBeenCalledTimes(1); + + // The base64-decoded MCP config JSON should reflect the substitution. + const writeCall = client.runCommand.mock.calls.find((c: any[]) => String(c[0]).includes('base64 -d')); + expect(writeCall).toBeDefined(); + const b64 = String(writeCall![0]).match(/printf %s '([A-Za-z0-9+/=]+)'/)![1]; + const cfg = JSON.parse(Buffer.from(b64, 'base64').toString('utf-8')); + expect(cfg.mcpServers.mine.args).toEqual(['/root/.mcp-servers/mine/server.js', '--flag']); + expect(cfg.mcpServers.fs).toEqual({ command: 'npx', args: ['-y', 'server-filesystem'] }); + + // sandboxCommand emits --mcp-config, not --strict-mcp-config. + const cmd = adapter.sandboxCommand('do the thing'); + expect(cmd).toContain("--mcp-config '/root/.mcp-servers/mcp-config.json'"); + expect(cmd).not.toContain('--strict-mcp-config'); + }); + + it('sandboxCommand omits --mcp-config when no MCP servers were installed', () => { + const fresh = new ClaudeAdapter({ command: 'claude' }); + expect(fresh.sandboxCommand('go')).not.toContain('--mcp-config'); + }); + }); }); diff --git a/src/agents/__tests__/codex.test.ts b/src/agents/__tests__/codex.test.ts index b42b4f6..333b53e 100644 --- a/src/agents/__tests__/codex.test.ts +++ b/src/agents/__tests__/codex.test.ts @@ -230,4 +230,40 @@ describe('CodexAdapter', () => { ])).rejects.toThrow(/'plugin-a'.*'plugin-b'/); }); }); + + describe('installMcpServersInSandbox', () => { + it('is a no-op when given an empty server list', async () => { + const client = makeMockSandboxClient(); + await adapter.installMcpServersInSandbox(client as any, []); + expect(client.runCommand).not.toHaveBeenCalled(); + }); + + it('uploads sourced servers, substitutes ${MCP_ROOT}, and appends config.toml blocks', async () => { + const client = makeMockSandboxClient(); + client.runCommand.mockResolvedValue({ stdout: '/root/.codex', stderr: '', exitCode: 0 }); + + await adapter.installMcpServersInSandbox(client as any, [ + { name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js'], hostDir: '/tmp/mine' }, + { name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); + + expect(mockUploadDir).toHaveBeenCalledWith( + client, '/tmp/mine', '/root/.codex/.mcp-servers/mine', 'mcp_mine', + ); + expect(mockUploadDir).toHaveBeenCalledTimes(1); + + // The base64-decoded TOML appended to config.toml reflects the substitution. + const writeCall = client.runCommand.mock.calls.find((c: any[]) => String(c[0]).includes('config.toml')); + expect(writeCall).toBeDefined(); + const b64 = String(writeCall![0]).match(/printf %s '([A-Za-z0-9+/=]+)'/)![1]; + const toml = Buffer.from(b64, 'base64').toString('utf-8'); + expect(toml).toContain('[mcp_servers.mine]'); + expect(toml).toContain('command = "node"'); + expect(toml).toContain('args = ["/root/.codex/.mcp-servers/mine/server.js"]'); + expect(toml).toContain('[mcp_servers.fs]'); + expect(toml).toContain('args = ["-y", "server-filesystem"]'); + // Append, not clobber. + expect(String(writeCall![0])).toContain('>>'); + }); + }); }); diff --git a/src/agents/adapter.ts b/src/agents/adapter.ts index c74e88c..295f826 100644 --- a/src/agents/adapter.ts +++ b/src/agents/adapter.ts @@ -1,4 +1,4 @@ -import { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; +import { AgentConfig, AgentResult, ResolvedExecutorPlugin, ResolvedExecutorMcpServer } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { ClaudeAdapter } from './claude.js'; import { CodexAdapter } from './codex.js'; @@ -43,6 +43,16 @@ export interface AgentAdapter { * clear error here instead of silently succeeding. */ installPluginsInSandbox(client: MicrosandboxClient, plugins: ResolvedExecutorPlugin[]): Promise; + + /** + * Install MCP servers into the running sandbox so the agent CLI connects to + * them at startup. Parallel to `installPluginsInSandbox` but a separate + * mechanism — each adapter wires servers through its CLI's native MCP-config + * surface (Claude: `--mcp-config`, Codex: `[mcp_servers.*]` in `config.toml`). + * Adapters whose CLI cannot load MCP servers in non-interactive mode raise a + * clear error here instead of silently succeeding. + */ + installMcpServersInSandbox(client: MicrosandboxClient, servers: ResolvedExecutorMcpServer[]): Promise; } const KNOWN_ADAPTERS: Record AgentAdapter> = { diff --git a/src/agents/base.ts b/src/agents/base.ts index aa141a2..29d51c6 100644 --- a/src/agents/base.ts +++ b/src/agents/base.ts @@ -1,4 +1,4 @@ -import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin, ResolvedExecutorMcpServer } from '../types.js'; import type { AgentAdapter } from './adapter.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { spawnAgent, spawnInteractive } from './spawn.js'; @@ -102,6 +102,23 @@ export abstract class BaseAdapter implements AgentAdapter { ); } + /** + * Install MCP servers into the sandbox VM. Subclasses override with + * CLI-specific wiring (Claude: `--mcp-config`, Codex: `config.toml`). + * Default raises a clear error so adapters that don't support MCP servers + * fail loudly when the user wires `executorMcpServers` against them. + */ + async installMcpServersInSandbox( + _client: MicrosandboxClient, + servers: ResolvedExecutorMcpServer[], + ): Promise { + if (servers.length === 0) return; + throw new Error( + `Agent adapter '${this.name}' does not support executorMcpServers. ` + + `Either remove executorMcpServers from config or switch executor to an adapter that supports MCP servers.`, + ); + } + /** Shared helper: spawn the agent process with piped stdio. */ protected spawn(args: string[], workDir: string, env?: Record, timeout?: number, stdin?: string): Promise { return spawnAgent(this.config.command, args, { cwd: workDir, env, timeout, stdin }); diff --git a/src/agents/claude.ts b/src/agents/claude.ts index 47d77fc..d813654 100644 --- a/src/agents/claude.ts +++ b/src/agents/claude.ts @@ -1,6 +1,6 @@ import { access } from 'node:fs/promises'; import { join } from 'node:path'; -import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin, ResolvedExecutorMcpServer } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { uploadDirToSandbox } from '../sandbox/scaffolding.js'; import { BaseAdapter } from './base.js'; @@ -19,6 +19,13 @@ export class ClaudeAdapter extends BaseAdapter { */ private installedPluginDirs: string[] = []; + /** + * Sandbox path of the MCP config JSON written by `installMcpServersInSandbox()`. + * `sandboxCommand()` reads this to emit a `--mcp-config ` flag so the + * Claude CLI connects to each MCP server for the run. Null until install. + */ + private installedMcpConfigPath: string | null = null; + constructor(config: AgentConfig) { super(config); } @@ -36,7 +43,13 @@ export class ClaudeAdapter extends BaseAdapter { const pluginFlags = this.installedPluginDirs .map((dir) => ` --plugin-dir '${dir}'`) .join(''); - const cmd = `cd ${workDir} && IS_SANDBOX=1 claude --print --dangerously-skip-permissions${pluginFlags} ${args.join(' ')} '${escaped}'${schemaFlags}`.trimEnd(); + // --mcp-config points the CLI at a JSON file of MCP servers to connect for + // the session. We intentionally do NOT pass --strict-mcp-config so the + // agent keeps any ambient MCP config alongside our injected servers. + const mcpFlag = this.installedMcpConfigPath + ? ` --mcp-config '${this.installedMcpConfigPath}'` + : ''; + const cmd = `cd ${workDir} && IS_SANDBOX=1 claude --print --dangerously-skip-permissions${pluginFlags}${mcpFlag} ${args.join(' ')} '${escaped}'${schemaFlags}`.trimEnd(); return cmd; } @@ -143,6 +156,53 @@ export class ClaudeAdapter extends BaseAdapter { })); } + /** + * Install MCP servers for the executor's Claude CLI session. + * + * Sourced servers are uploaded under `$HOME/.mcp-servers//` (outside + * `/workspace` so the agent's workspace stays clean). `${MCP_ROOT}` in each + * server's args is substituted with that server's sandbox install dir. A + * combined MCP config JSON is written to `$HOME/.mcp-servers/mcp-config.json` + * and surfaced to `sandboxCommand()` as a `--mcp-config` flag. + */ + async installMcpServersInSandbox( + client: MicrosandboxClient, + servers: ResolvedExecutorMcpServer[], + ): Promise { + if (servers.length === 0) return; + + const homeResult = await client.runCommand('printf %s "${HOME:-/root}"'); + const home = homeResult.stdout.trim() || '/root'; + const mcpRoot = `${home}/.mcp-servers`; + + const mcpServers: Record = {}; + + for (const server of servers) { + let args = server.args; + if (server.hostDir) { + const destDir = `${mcpRoot}/${server.name}`; + await uploadDirToSandbox(client, server.hostDir, destDir, `mcp_${server.name}`); + args = args.map((a) => a.split('${MCP_ROOT}').join(destDir)); + } + mcpServers[server.name] = { command: server.command, args }; + } + + const configJson = JSON.stringify({ mcpServers }, null, 2); + const configPath = `${mcpRoot}/mcp-config.json`; + const encoded = Buffer.from(configJson, 'utf-8').toString('base64'); + const writeResult = await client.runCommand( + `mkdir -p '${mcpRoot}' && printf %s '${encoded}' | base64 -d > '${configPath}'`, + ); + if (writeResult.exitCode !== 0) { + throw new Error( + `Failed to write MCP config into sandbox at '${configPath}': ` + + `${writeResult.stderr || writeResult.stdout}`, + ); + } + + this.installedMcpConfigPath = configPath; + } + protected parseEnvelope(result: AgentResult): AgentResult | null { try { const envelope = JSON.parse(result.stdout); diff --git a/src/agents/codex.ts b/src/agents/codex.ts index 0997e72..d847b87 100644 --- a/src/agents/codex.ts +++ b/src/agents/codex.ts @@ -1,7 +1,7 @@ import { writeFile, readFile, rm, access, readdir, stat } from 'node:fs/promises'; import { join } from 'node:path'; import { tmpdir } from 'node:os'; -import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin, ResolvedExecutorMcpServer } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { uploadDirToSandbox } from '../sandbox/scaffolding.js'; import { BaseAdapter } from './base.js'; @@ -192,6 +192,51 @@ export class CodexAdapter extends BaseAdapter { )); } + /** + * Install MCP servers for the executor's Codex CLI session. + * + * Sourced servers are uploaded under `$CODEX_HOME/.mcp-servers//`. + * `${MCP_ROOT}` in each server's args is substituted with that server's + * sandbox install dir. An `[mcp_servers.]` TOML block (command + args) + * is appended to `$CODEX_HOME/config.toml`, which Codex auto-reads. Existing + * config content is preserved — blocks are appended, never clobbered. + */ + async installMcpServersInSandbox( + client: MicrosandboxClient, + servers: ResolvedExecutorMcpServer[], + ): Promise { + if (servers.length === 0) return; + + const homeResult = await client.runCommand('printf %s "${CODEX_HOME:-${HOME:-/root}/.codex}"'); + const codexHome = homeResult.stdout.trim() || '/root/.codex'; + const mcpRoot = `${codexHome}/.mcp-servers`; + + const tomlBlocks: string[] = []; + + for (const server of servers) { + let args = server.args; + if (server.hostDir) { + const destDir = `${mcpRoot}/${server.name}`; + await uploadDirToSandbox(client, server.hostDir, destDir, `mcp_${server.name}`); + args = args.map((a) => a.split('${MCP_ROOT}').join(destDir)); + } + tomlBlocks.push(renderMcpServerToml(server.name, server.command, args)); + } + + const configPath = `${codexHome}/config.toml`; + const tomlText = `\n${tomlBlocks.join('\n')}\n`; + const encoded = Buffer.from(tomlText, 'utf-8').toString('base64'); + const writeResult = await client.runCommand( + `mkdir -p '${codexHome}' && touch '${configPath}' && printf %s '${encoded}' | base64 -d >> '${configPath}'`, + ); + if (writeResult.exitCode !== 0) { + throw new Error( + `Failed to append MCP servers to Codex config at '${configPath}': ` + + `${writeResult.stderr || writeResult.stdout}`, + ); + } + } + async extractLog(client: MicrosandboxClient): Promise { const result = await client.runCommand( "find / -path '*/.codex/sessions/*.jsonl' -type f 2>/dev/null | sort | tail -1", @@ -216,3 +261,24 @@ export class CodexAdapter extends BaseAdapter { } } } + +/** Encode a JS string as a TOML basic string (double-quoted, escaped). */ +function tomlString(value: string): string { + const escaped = value + .replace(/\\/g, '\\\\') + .replace(/"/g, '\\"') + .replace(/\n/g, '\\n') + .replace(/\t/g, '\\t') + .replace(/\r/g, '\\r'); + return `"${escaped}"`; +} + +/** Render an `[mcp_servers.]` TOML block with command + args. */ +function renderMcpServerToml(name: string, command: string, args: string[]): string { + const argsArray = args.map(tomlString).join(', '); + return [ + `[mcp_servers.${name}]`, + `command = ${tomlString(command)}`, + `args = [${argsArray}]`, + ].join('\n'); +} diff --git a/src/agents/gemini.ts b/src/agents/gemini.ts index 03d3dcc..db3bded 100644 --- a/src/agents/gemini.ts +++ b/src/agents/gemini.ts @@ -1,6 +1,6 @@ import { access } from 'node:fs/promises'; import { join } from 'node:path'; -import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin, ResolvedExecutorMcpServer } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { uploadDirToSandbox } from '../sandbox/scaffolding.js'; import { BaseAdapter } from './base.js'; @@ -90,6 +90,23 @@ export class GeminiAdapter extends BaseAdapter { )); } + /** + * Gemini CLI loads MCP servers from `settings.json`, but in non-interactive + * (`--yolo -p`) mode it cannot complete the per-server trust handshake the + * CLI requires, so injected MCP servers would not be reliably connected for + * the run. We fail loudly rather than silently no-op an A/B comparison. + */ + async installMcpServersInSandbox( + _client: MicrosandboxClient, + servers: ResolvedExecutorMcpServer[], + ): Promise { + if (servers.length === 0) return; + throw new Error( + `Agent adapter 'gemini' does not support executorMcpServers in non-interactive mode. ` + + `Remove executorMcpServers from config or switch the executor to an adapter that does (claude, codex).`, + ); + } + async extractLog(client: MicrosandboxClient): Promise { const result = await client.runCommand( "find / -path '*/.gemini/tmp/*/chats/session-*.jsonl' -type f 2>/dev/null | sort | tail -1", diff --git a/src/commands/execute.ts b/src/commands/execute.ts index a327c57..c1d94fc 100644 --- a/src/commands/execute.ts +++ b/src/commands/execute.ts @@ -5,7 +5,7 @@ import { loadConfig } from '../core/config.js'; import { loadTestSuite, saveResult, saveBinaryResult, formatElapsed } from '../core/suite-io.js'; import { MicrosandboxClient, buildSecrets, applyAgentAuth, resolveEnv, type CommandResult } from '../sandbox/microsandbox.js'; import { createEgressLogger } from '../sandbox/egress-logger.js'; -import { scaffoldWorkspace, resolveExecutorPlugins } from '../sandbox/scaffolding.js'; +import { scaffoldWorkspace, resolveExecutorPlugins, resolveExecutorMcpServers } from '../sandbox/scaffolding.js'; import { WorkerPool } from '../sandbox/worker-pool.js'; import { createAdapter } from '../agents/adapter.js'; import { getPackageSource, getUrlSources, getFileSources } from '../types.js'; @@ -211,6 +211,19 @@ export async function executeTestCase( } } + // MCP servers are also executor-only — parallel to executorPlugins but a + // separate mechanism. The judge sandbox stays MCP-free. + if (config.executorMcpServers && config.executorMcpServers.length > 0) { + const resolvedMcpServers = await resolveExecutorMcpServers(config.executorMcpServers, paths.cacheRepos); + try { + await adapter.installMcpServersInSandbox(client, resolvedMcpServers); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + await saveResult(paths, testCase.id, 'mcp-server-install-error.log', message, target.name); + throw new Error(`Executor MCP server install failed: ${message}`); + } + } + // Upload public file sources (docs directories, etc.) for the executor const publicFileSources = getFileSources(config.publicInfo ?? []); const publicSourceDirs = publicFileSources.length > 0 diff --git a/src/core/__tests__/config.test.ts b/src/core/__tests__/config.test.ts index 0efe953..5529099 100644 --- a/src/core/__tests__/config.test.ts +++ b/src/core/__tests__/config.test.ts @@ -325,4 +325,116 @@ describe('loadConfig', () => { await expect(loadConfig('/fake/config.json')).rejects.toThrow(/duplicated/); }); }); + + describe('executorMcpServers', () => { + it('accepts a sourceless server (no type)', async () => { + const config = { + ...validConfig, + executorMcpServers: [ + { name: 'fs', command: 'npx', args: ['-y', '@modelcontextprotocol/server-filesystem', '/workspace'] }, + ], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + const result = await loadConfig('/fake/config.json'); + expect(result.executorMcpServers).toHaveLength(1); + expect(result.executorMcpServers?.[0]).toMatchObject({ name: 'fs', command: 'npx' }); + }); + + it('accepts a local server with ${MCP_ROOT} in args', async () => { + const config = { + ...validConfig, + executorMcpServers: [ + { type: 'local', name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js'], path: '/tmp/mine' }, + ], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + const result = await loadConfig('/fake/config.json'); + expect(result.executorMcpServers).toHaveLength(1); + }); + + it('accepts a git server with branch + subpath', async () => { + const config = { + ...validConfig, + executorMcpServers: [ + { type: 'git', name: 'shared', command: 'node', args: ['${MCP_ROOT}/index.js'], + url: 'https://example.com/repo.git', branch: 'main', subpath: 'mcp' }, + ], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + const result = await loadConfig('/fake/config.json'); + expect(result.executorMcpServers).toHaveLength(1); + }); + + it('throws when executorMcpServers is not an array', async () => { + const config = { ...validConfig, executorMcpServers: 'not-an-array' }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/executorMcpServers must be an array/); + }); + + it('throws when a server has an invalid name', async () => { + const config = { + ...validConfig, + executorMcpServers: [{ name: 'bad name; rm -rf /', command: 'node', args: [] }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/unsupported characters/); + }); + + it('throws when a server is missing command', async () => { + const config = { + ...validConfig, + executorMcpServers: [{ name: 'a', args: [] }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/command/); + }); + + it('throws when a server has non-array args', async () => { + const config = { + ...validConfig, + executorMcpServers: [{ name: 'a', command: 'node', args: 'oops' }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/args/); + }); + + it('throws when ${MCP_ROOT} is used without a source', async () => { + const config = { + ...validConfig, + executorMcpServers: [{ name: 'a', command: 'node', args: ['${MCP_ROOT}/server.js'] }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/MCP_ROOT/); + }); + + it('throws when a local server is missing path', async () => { + const config = { + ...validConfig, + executorMcpServers: [{ type: 'local', name: 'a', command: 'node', args: [] }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/path/); + }); + + it('throws when a server has an invalid type', async () => { + const config = { + ...validConfig, + executorMcpServers: [{ type: 'url', name: 'a', command: 'node', args: [] }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/invalid/i); + }); + + it('throws when two servers share the same name', async () => { + const config = { + ...validConfig, + executorMcpServers: [ + { name: 'a', command: 'node', args: [] }, + { name: 'a', command: 'npx', args: [] }, + ], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/duplicated/); + }); + }); }); diff --git a/src/core/config.ts b/src/core/config.ts index cb03cc6..578e95b 100644 --- a/src/core/config.ts +++ b/src/core/config.ts @@ -65,6 +65,63 @@ function validateExecutorPluginEntry(plugin: Record, prefix: st } } +const MCP_ROOT_PLACEHOLDER = '${MCP_ROOT}'; + +function validateExecutorMcpServerEntry(server: Record, prefix: string): void { + if (!server || typeof server !== 'object' || Array.isArray(server)) { + throw new Error(`${prefix} must be an object`); + } + + if (!server.name || typeof server.name !== 'string') { + throw new Error(`${prefix} requires a non-empty 'name' string`); + } + if (!/^[a-zA-Z0-9._-]+$/.test(server.name)) { + throw new Error( + `${prefix}.name '${server.name}' contains unsupported characters. ` + + `Use only letters, digits, '.', '_', or '-'.`, + ); + } + + if (!server.command || typeof server.command !== 'string') { + throw new Error(`${prefix} requires a non-empty 'command' string`); + } + + if (!Array.isArray(server.args) || !server.args.every((a) => typeof a === 'string')) { + throw new Error(`${prefix} requires 'args' to be an array of strings`); + } + const args = server.args as string[]; + + // `type` is OPTIONAL for MCP servers. When present it must be local|git. + if (server.type !== undefined) { + if (typeof server.type !== 'string' || !VALID_PLUGIN_TYPES.includes(server.type)) { + throw new Error( + `${prefix}.type '${String(server.type)}' is invalid. Must be one of: ` + + `${VALID_PLUGIN_TYPES.map(t => `'${t}'`).join(', ')} (or omitted for a sourceless server)`, + ); + } + switch (server.type) { + case 'local': + if (!server.path || typeof server.path !== 'string') { + throw new Error(`${prefix} type 'local' requires path to be set`); + } + break; + case 'git': + if (!server.url || typeof server.url !== 'string') { + throw new Error(`${prefix} type 'git' requires url to be set`); + } + break; + } + } else { + // Sourceless server — `${MCP_ROOT}` cannot be substituted, so reject it. + if (args.some((a) => a.includes(MCP_ROOT_PLACEHOLDER))) { + throw new Error( + `${prefix} uses the '${MCP_ROOT_PLACEHOLDER}' placeholder but has no source ` + + `('type' is omitted). Add a 'local' or 'git' source, or remove the placeholder.`, + ); + } + } +} + function validateSourceEntry(source: Record, prefix: string): void { if (!source || typeof source !== 'object' || Array.isArray(source)) { throw new Error(`${prefix} must be an object`); @@ -147,6 +204,23 @@ export function validateConfig(data: unknown, configPath?: string): Config { } } + // Validate executorMcpServers (optional) + if (obj.executorMcpServers !== undefined) { + if (!Array.isArray(obj.executorMcpServers)) { + throw new Error('Config executorMcpServers must be an array'); + } + const seenMcpNames = new Set(); + for (let i = 0; i < obj.executorMcpServers.length; i++) { + const entry = obj.executorMcpServers[i] as Record; + validateExecutorMcpServerEntry(entry, `executorMcpServers[${i}]`); + const name = entry.name as string; + if (seenMcpNames.has(name)) { + throw new Error(`executorMcpServers[${i}].name '${name}' is duplicated`); + } + seenMcpNames.add(name); + } + } + // Validate targets if (!Array.isArray(obj.targets) || obj.targets.length === 0) { throw new Error('Config requires at least one target in targets array'); diff --git a/src/sandbox/scaffolding.ts b/src/sandbox/scaffolding.ts index eabb9dc..e31b22b 100644 --- a/src/sandbox/scaffolding.ts +++ b/src/sandbox/scaffolding.ts @@ -3,7 +3,10 @@ import { execFile } from 'node:child_process'; import { basename, join, relative, resolve as resolvePath } from 'node:path'; import { tmpdir } from 'node:os'; import type { MicrosandboxClient } from './microsandbox.js'; -import type { Config, TestCase, SourceConfig, ExecutorPlugin, ResolvedExecutorPlugin } from '../types.js'; +import type { + Config, TestCase, SourceConfig, ExecutorPlugin, ResolvedExecutorPlugin, + ExecutorMcpServer, ResolvedExecutorMcpServer, +} from '../types.js'; import { resolveSources, resolveSource } from '../core/source-resolver.js'; /** Directories excluded from source uploads — these are large and not useful for evaluation. */ @@ -301,3 +304,41 @@ export async function resolveExecutorPlugins( return { name: plugin.name, hostDir: resolvePath(hostDir) }; })); } + +/** + * Resolve `config.executorMcpServers` to host-side directories. Mirrors + * `resolveExecutorPlugins`: entries with a source (local/git) resolve via the + * shared source-resolver (git clones cached under `cacheRepos`); sourceless + * entries (no `type`) carry no `hostDir`. + * + * Returns an empty array when no MCP servers are configured. + */ +export async function resolveExecutorMcpServers( + servers: ExecutorMcpServer[] | undefined, + cacheRepos: string, +): Promise { + if (!servers || servers.length === 0) return []; + + return Promise.all(servers.map(async (server): Promise => { + if (server.type === undefined) { + // Sourceless — command/args used as-is. + return { name: server.name, command: server.command, args: server.args }; + } + const source: SourceConfig = server.type === 'local' + ? { type: 'local', path: server.path } + : { + type: 'git', + url: server.url, + branch: server.branch, + subpath: server.subpath, + sparse: server.sparse, + }; + const hostDir = await resolveSource(source, { reposDir: cacheRepos }); + return { + name: server.name, + command: server.command, + args: server.args, + hostDir: resolvePath(hostDir), + }; + })); +} diff --git a/src/scoring/__tests__/judge.test.ts b/src/scoring/__tests__/judge.test.ts index 0a3691f..be5eb66 100644 --- a/src/scoring/__tests__/judge.test.ts +++ b/src/scoring/__tests__/judge.test.ts @@ -72,6 +72,7 @@ function makeMockAdapter(opts: { stdout: string }) { extractResult: vi.fn().mockImplementation((stdout: string) => stdout), extractLog: vi.fn().mockResolvedValue(null), installPluginsInSandbox: vi.fn().mockResolvedValue(undefined), + installMcpServersInSandbox: vi.fn().mockResolvedValue(undefined), }; } diff --git a/src/types.ts b/src/types.ts index c6739ff..bc0c184 100644 --- a/src/types.ts +++ b/src/types.ts @@ -183,6 +183,70 @@ export interface ResolvedExecutorPlugin { hostDir: string; } +/** + * Source of an MCP server to install into the executor's agent CLI. + * + * Independent of `ExecutorPlugin` — MCP servers are wired through each CLI's + * native MCP-config surface, NOT through agent plugins (a Claude plugin's + * `mcpServers` are not loaded by `claude --plugin-dir`). + * + * Three shapes, discriminated by the optional `type` field: + * - `LocalExecutorMcpServer` (`type:'local'`) — upload a host directory tree. + * - `GitExecutorMcpServer` (`type:'git'`) — clone a git source. + * - `CommandExecutorMcpServer` (no `type`) — sourceless; `command`/`args` + * are used as-is (e.g. an MCP server launched via `npx`). + * + * All three share `name`/`command`/`args`. `args` may contain the literal + * placeholder `${MCP_ROOT}`, substituted at install time with the absolute + * sandbox path of the uploaded source directory — only valid when there IS a + * source. + */ +export interface BaseExecutorMcpServer { + /** MCP server slug — used as the server name in MCP config and the install dir name. */ + name: string; + /** Executable that launches the MCP server (e.g. "node", "npx"). */ + command: string; + /** Args passed to `command`. May contain `${MCP_ROOT}` when a source is present. */ + args: string[]; +} + +export interface LocalExecutorMcpServer extends BaseExecutorMcpServer { + type: 'local'; + /** Absolute or relative path to a host directory tree uploaded into the sandbox. */ + path: string; +} + +export interface GitExecutorMcpServer extends BaseExecutorMcpServer { + type: 'git'; + url: string; + branch?: string; + /** Path within the cloned repo that contains the MCP server source. */ + subpath?: string; + sparse?: string[]; +} + +export interface CommandExecutorMcpServer extends BaseExecutorMcpServer { + type?: undefined; +} + +export type ExecutorMcpServer = + | LocalExecutorMcpServer + | GitExecutorMcpServer + | CommandExecutorMcpServer; + +/** + * An ExecutorMcpServer after host-side resolution. The adapter receives this + * and decides how to wire it into the sandbox VM. `hostDir` is present only + * when the entry had a source (local/git); sourceless entries omit it. + */ +export interface ResolvedExecutorMcpServer { + name: string; + command: string; + args: string[]; + /** Absolute host path to the MCP server source — only when there was a source. */ + hostDir?: string; +} + export interface SecretConfig { /** Raw value or "$ENV_VAR" reference resolved from host environment. */ value: string; @@ -223,6 +287,15 @@ export interface Config { * with these so judge scoring stays independent of the executor's tooling. */ executorPlugins?: ExecutorPlugin[]; + /** + * MCP servers to install into the executor agent's CLI inside the sandbox VM. + * Parallel to `executorPlugins` but a separate mechanism — each adapter wires + * these through its CLI's native MCP-config surface (Claude: `--mcp-config`, + * Codex: `[mcp_servers.*]` in `config.toml`). MCP servers are scoped to the + * executor — the judge sandbox is intentionally not seeded with them so judge + * scoring stays independent of the executor's tooling. + */ + executorMcpServers?: ExecutorMcpServer[]; sandbox: SandboxConfig; } From 58b5ab87c3bcd07e534b7e07346882ba15100514 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Mon, 18 May 2026 11:18:40 +1200 Subject: [PATCH 2/5] fix(executor): upload MCP server payloads verbatim (incl. node_modules) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit installMcpServersInSandbox uploaded via getSourceArchive, whose EXCLUDED_DIRS strips node_modules — correct for SDK source trees, fatal for a runnable MCP server, which then crashes on spawn (ERR_MODULE_NOT_FOUND) and fails the client handshake. Add an `includeAll` option to getSourceArchive / uploadDirToSandbox and use it for MCP server uploads. Verified in a real microVM: the web-mcp handshake now completes with all tools. Co-Authored-By: Claude Sonnet 4.6 --- src/agents/__tests__/claude.test.ts | 4 ++-- src/agents/__tests__/codex.test.ts | 2 +- src/agents/claude.ts | 4 +++- src/agents/codex.ts | 4 +++- src/sandbox/scaffolding.ts | 33 ++++++++++++++++++++++------- 5 files changed, 34 insertions(+), 13 deletions(-) diff --git a/src/agents/__tests__/claude.test.ts b/src/agents/__tests__/claude.test.ts index e8eba5a..5575526 100644 --- a/src/agents/__tests__/claude.test.ts +++ b/src/agents/__tests__/claude.test.ts @@ -204,8 +204,8 @@ describe('ClaudeAdapter', () => { { name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, ]); - // Sourced server is uploaded outside /workspace. - expect(mockUploadDir).toHaveBeenCalledWith(client, '/tmp/mine', '/root/.mcp-servers/mine', 'mcp_mine'); + // Sourced server is uploaded outside /workspace, verbatim (incl. node_modules). + expect(mockUploadDir).toHaveBeenCalledWith(client, '/tmp/mine', '/root/.mcp-servers/mine', 'mcp_mine', { includeAll: true }); // Sourceless server is not uploaded. expect(mockUploadDir).toHaveBeenCalledTimes(1); diff --git a/src/agents/__tests__/codex.test.ts b/src/agents/__tests__/codex.test.ts index 333b53e..f37b216 100644 --- a/src/agents/__tests__/codex.test.ts +++ b/src/agents/__tests__/codex.test.ts @@ -248,7 +248,7 @@ describe('CodexAdapter', () => { ]); expect(mockUploadDir).toHaveBeenCalledWith( - client, '/tmp/mine', '/root/.codex/.mcp-servers/mine', 'mcp_mine', + client, '/tmp/mine', '/root/.codex/.mcp-servers/mine', 'mcp_mine', { includeAll: true }, ); expect(mockUploadDir).toHaveBeenCalledTimes(1); diff --git a/src/agents/claude.ts b/src/agents/claude.ts index d813654..a9ad55e 100644 --- a/src/agents/claude.ts +++ b/src/agents/claude.ts @@ -181,7 +181,9 @@ export class ClaudeAdapter extends BaseAdapter { let args = server.args; if (server.hostDir) { const destDir = `${mcpRoot}/${server.name}`; - await uploadDirToSandbox(client, server.hostDir, destDir, `mcp_${server.name}`); + // includeAll: an MCP server is a runnable artifact — it needs its + // node_modules, which the default source-archive exclusion strips. + await uploadDirToSandbox(client, server.hostDir, destDir, `mcp_${server.name}`, { includeAll: true }); args = args.map((a) => a.split('${MCP_ROOT}').join(destDir)); } mcpServers[server.name] = { command: server.command, args }; diff --git a/src/agents/codex.ts b/src/agents/codex.ts index d847b87..7ca9709 100644 --- a/src/agents/codex.ts +++ b/src/agents/codex.ts @@ -217,7 +217,9 @@ export class CodexAdapter extends BaseAdapter { let args = server.args; if (server.hostDir) { const destDir = `${mcpRoot}/${server.name}`; - await uploadDirToSandbox(client, server.hostDir, destDir, `mcp_${server.name}`); + // includeAll: an MCP server is a runnable artifact — it needs its + // node_modules, which the default source-archive exclusion strips. + await uploadDirToSandbox(client, server.hostDir, destDir, `mcp_${server.name}`, { includeAll: true }); args = args.map((a) => a.split('${MCP_ROOT}').join(destDir)); } tomlBlocks.push(renderMcpServerToml(server.name, server.command, args)); diff --git a/src/sandbox/scaffolding.ts b/src/sandbox/scaffolding.ts index e31b22b..18848f9 100644 --- a/src/sandbox/scaffolding.ts +++ b/src/sandbox/scaffolding.ts @@ -82,24 +82,37 @@ async function readDirRecursive( } /** - * Create a tar.gz archive of a directory, excluding common bloat dirs. - * Archives are cached on disk so concurrent sandboxes reuse the same tarball. + * Create a tar.gz archive of a directory. + * + * By default, common bloat dirs (`node_modules`, `.git`, build outputs, …) are + * excluded — correct for SDK *source* uploads. Pass `{ includeAll: true }` to + * archive the tree verbatim: required for runnable artifacts like MCP server + * payloads, which need their `node_modules` to actually start. + * + * Archives are cached on disk so concurrent sandboxes reuse the same tarball; + * the cache key includes the `includeAll` flag so source/full archives of the + * same path don't collide. */ -export async function getSourceArchive(srcPath: string): Promise { - const cached = sourceArchiveCache.get(srcPath); +export async function getSourceArchive( + srcPath: string, + opts?: { includeAll?: boolean }, +): Promise { + const includeAll = opts?.includeAll ?? false; + const cacheKey = `${srcPath}::${includeAll ? 'all' : 'src'}`; + const cached = sourceArchiveCache.get(cacheKey); if (cached) { try { await fsStat(cached); return cached; } catch { - sourceArchiveCache.delete(srcPath); + sourceArchiveCache.delete(cacheKey); } } const dirName = basename(srcPath); const tarPath = join(tmpdir(), `agentic-sources-${dirName}-${Date.now()}.tar.gz`); - const excludeArgs = [ + const excludeArgs = includeAll ? [] : [ ...EXCLUDED_DIRS.flatMap((d) => ['--exclude', d]), ...EXCLUDED_EXTENSIONS.flatMap((ext) => ['--exclude', ext]), ]; @@ -114,7 +127,7 @@ export async function getSourceArchive(srcPath: string): Promise { }); }); - sourceArchiveCache.set(srcPath, tarPath); + sourceArchiveCache.set(cacheKey, tarPath); return tarPath; } @@ -125,14 +138,18 @@ export async function getSourceArchive(srcPath: string): Promise { * * `archiveLabel` should be a slug-safe identifier (used only to name the * temporary tarball path inside the sandbox). + * + * Pass `{ includeAll: true }` to upload the tree verbatim (no `node_modules` + * exclusion) — required for runnable artifacts such as MCP server payloads. */ export async function uploadDirToSandbox( client: MicrosandboxClient, hostDir: string, sandboxDestDir: string, archiveLabel: string, + opts?: { includeAll?: boolean }, ): Promise { - const tarPath = await getSourceArchive(hostDir); + const tarPath = await getSourceArchive(hostDir, opts); const tarData = await readFile(tarPath); const sandboxTarPath = `/tmp/_${archiveLabel}.tar.gz`; await client.uploadBinaryFile(sandboxTarPath, tarData); From 3634fb428f7e838cdb211c4ac3d8f16a8dc49c62 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Thu, 21 May 2026 17:03:01 +1200 Subject: [PATCH 3/5] refactor(executor): tighten MCP server install path post-review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Post-review cleanup on the MCP server feature, addressing findings from a multi-agent review pass (code-reuse / code-quality / efficiency): - Parallelize per-server uploads in ClaudeAdapter and CodexAdapter installMcpServersInSandbox — was a serial for-await loop, now Promise.all. N-server installs no longer pay N× upload latency. - Run the executorPlugins and executorMcpServers install blocks in execute.ts concurrently via Promise.all — they write to disjoint sandbox dirs and disjoint adapter state. - Extract src/sandbox/mcp.ts as the home for MCP-server install primitives: MCP_ROOT_PLACEHOLDER constant, substituteMcpRoot helper, and uploadMcpServerSources (the shared upload + ${MCP_ROOT}-substitution loop). Claude / Codex adapters now own only their CLI-specific serializer (JSON config vs TOML append) and writer. - Make ResolvedExecutorMcpServer a discriminated union (kind:'sourced'|'sourceless') instead of an optional hostDir. Consumers narrow at compile time rather than probing optional fields. - Lift MCP_ROOT_PLACEHOLDER out of config.ts so config validation and the adapters' substitution use the same literal. No behaviour change beyond the parallelization. Type-check clean; 368/368 tests still pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/agents/__tests__/claude.test.ts | 4 +-- src/agents/__tests__/codex.test.ts | 4 +-- src/agents/claude.ts | 16 +++------ src/agents/codex.ts | 17 +++------- src/commands/execute.ts | 50 +++++++++++++++++------------ src/core/config.ts | 3 +- src/sandbox/mcp.ts | 44 +++++++++++++++++++++++++ src/sandbox/scaffolding.ts | 4 +-- src/types.ts | 30 +++++++++++------ 9 files changed, 110 insertions(+), 62 deletions(-) create mode 100644 src/sandbox/mcp.ts diff --git a/src/agents/__tests__/claude.test.ts b/src/agents/__tests__/claude.test.ts index 5575526..8a7eed5 100644 --- a/src/agents/__tests__/claude.test.ts +++ b/src/agents/__tests__/claude.test.ts @@ -200,8 +200,8 @@ describe('ClaudeAdapter', () => { client.runCommand.mockResolvedValue({ stdout: '/root', stderr: '', exitCode: 0 }); await adapter.installMcpServersInSandbox(client as any, [ - { name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js', '--flag'], hostDir: '/tmp/mine' }, - { name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + { kind: 'sourced', name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js', '--flag'], hostDir: '/tmp/mine' }, + { kind: 'sourceless', name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, ]); // Sourced server is uploaded outside /workspace, verbatim (incl. node_modules). diff --git a/src/agents/__tests__/codex.test.ts b/src/agents/__tests__/codex.test.ts index f37b216..c3e7b78 100644 --- a/src/agents/__tests__/codex.test.ts +++ b/src/agents/__tests__/codex.test.ts @@ -243,8 +243,8 @@ describe('CodexAdapter', () => { client.runCommand.mockResolvedValue({ stdout: '/root/.codex', stderr: '', exitCode: 0 }); await adapter.installMcpServersInSandbox(client as any, [ - { name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js'], hostDir: '/tmp/mine' }, - { name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + { kind: 'sourced', name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js'], hostDir: '/tmp/mine' }, + { kind: 'sourceless', name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, ]); expect(mockUploadDir).toHaveBeenCalledWith( diff --git a/src/agents/claude.ts b/src/agents/claude.ts index a9ad55e..2fea940 100644 --- a/src/agents/claude.ts +++ b/src/agents/claude.ts @@ -3,6 +3,7 @@ import { join } from 'node:path'; import type { AgentConfig, AgentResult, ResolvedExecutorPlugin, ResolvedExecutorMcpServer } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { uploadDirToSandbox } from '../sandbox/scaffolding.js'; +import { uploadMcpServerSources } from '../sandbox/mcp.js'; import { BaseAdapter } from './base.js'; export class ClaudeAdapter extends BaseAdapter { @@ -175,18 +176,11 @@ export class ClaudeAdapter extends BaseAdapter { const home = homeResult.stdout.trim() || '/root'; const mcpRoot = `${home}/.mcp-servers`; - const mcpServers: Record = {}; + const resolved = await uploadMcpServerSources(client, mcpRoot, servers); - for (const server of servers) { - let args = server.args; - if (server.hostDir) { - const destDir = `${mcpRoot}/${server.name}`; - // includeAll: an MCP server is a runnable artifact — it needs its - // node_modules, which the default source-archive exclusion strips. - await uploadDirToSandbox(client, server.hostDir, destDir, `mcp_${server.name}`, { includeAll: true }); - args = args.map((a) => a.split('${MCP_ROOT}').join(destDir)); - } - mcpServers[server.name] = { command: server.command, args }; + const mcpServers: Record = {}; + for (const server of resolved) { + mcpServers[server.name] = { command: server.command, args: server.args }; } const configJson = JSON.stringify({ mcpServers }, null, 2); diff --git a/src/agents/codex.ts b/src/agents/codex.ts index 7ca9709..0d80c4c 100644 --- a/src/agents/codex.ts +++ b/src/agents/codex.ts @@ -4,6 +4,7 @@ import { tmpdir } from 'node:os'; import type { AgentConfig, AgentResult, ResolvedExecutorPlugin, ResolvedExecutorMcpServer } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { uploadDirToSandbox } from '../sandbox/scaffolding.js'; +import { uploadMcpServerSources } from '../sandbox/mcp.js'; import { BaseAdapter } from './base.js'; export class CodexAdapter extends BaseAdapter { @@ -211,19 +212,11 @@ export class CodexAdapter extends BaseAdapter { const codexHome = homeResult.stdout.trim() || '/root/.codex'; const mcpRoot = `${codexHome}/.mcp-servers`; - const tomlBlocks: string[] = []; + const resolved = await uploadMcpServerSources(client, mcpRoot, servers); - for (const server of servers) { - let args = server.args; - if (server.hostDir) { - const destDir = `${mcpRoot}/${server.name}`; - // includeAll: an MCP server is a runnable artifact — it needs its - // node_modules, which the default source-archive exclusion strips. - await uploadDirToSandbox(client, server.hostDir, destDir, `mcp_${server.name}`, { includeAll: true }); - args = args.map((a) => a.split('${MCP_ROOT}').join(destDir)); - } - tomlBlocks.push(renderMcpServerToml(server.name, server.command, args)); - } + const tomlBlocks = resolved.map((server) => + renderMcpServerToml(server.name, server.command, server.args), + ); const configPath = `${codexHome}/config.toml`; const tomlText = `\n${tomlBlocks.join('\n')}\n`; diff --git a/src/commands/execute.ts b/src/commands/execute.ts index c1d94fc..1bef73b 100644 --- a/src/commands/execute.ts +++ b/src/commands/execute.ts @@ -198,30 +198,38 @@ export async function executeTestCase( } } - // Plugins are intentionally executor-only — the judge sandbox stays - // plugin-free so its scoring is independent of the executor's tooling. + // Plugins and MCP servers are both executor-only — the judge sandbox stays + // free of either so its scoring is independent of the executor's tooling. + // Run the two install blocks concurrently: they write to disjoint sandbox + // dirs (plugins under the CLI's plugin dir; MCP servers under .mcp-servers) + // and update disjoint adapter state. + const installTasks: Promise[] = []; if (config.executorPlugins && config.executorPlugins.length > 0) { - const resolvedPlugins = await resolveExecutorPlugins(config.executorPlugins, paths.cacheRepos); - try { - await adapter.installPluginsInSandbox(client, resolvedPlugins); - } catch (err) { - const message = err instanceof Error ? err.message : String(err); - await saveResult(paths, testCase.id, 'plugin-install-error.log', message, target.name); - throw new Error(`Executor plugin install failed: ${message}`); - } + installTasks.push((async () => { + const resolvedPlugins = await resolveExecutorPlugins(config.executorPlugins, paths.cacheRepos); + try { + await adapter.installPluginsInSandbox(client, resolvedPlugins); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + await saveResult(paths, testCase.id, 'plugin-install-error.log', message, target.name); + throw new Error(`Executor plugin install failed: ${message}`); + } + })()); } - - // MCP servers are also executor-only — parallel to executorPlugins but a - // separate mechanism. The judge sandbox stays MCP-free. if (config.executorMcpServers && config.executorMcpServers.length > 0) { - const resolvedMcpServers = await resolveExecutorMcpServers(config.executorMcpServers, paths.cacheRepos); - try { - await adapter.installMcpServersInSandbox(client, resolvedMcpServers); - } catch (err) { - const message = err instanceof Error ? err.message : String(err); - await saveResult(paths, testCase.id, 'mcp-server-install-error.log', message, target.name); - throw new Error(`Executor MCP server install failed: ${message}`); - } + installTasks.push((async () => { + const resolvedMcpServers = await resolveExecutorMcpServers(config.executorMcpServers, paths.cacheRepos); + try { + await adapter.installMcpServersInSandbox(client, resolvedMcpServers); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + await saveResult(paths, testCase.id, 'mcp-server-install-error.log', message, target.name); + throw new Error(`Executor MCP server install failed: ${message}`); + } + })()); + } + if (installTasks.length > 0) { + await Promise.all(installTasks); } // Upload public file sources (docs directories, etc.) for the executor diff --git a/src/core/config.ts b/src/core/config.ts index 578e95b..3976717 100644 --- a/src/core/config.ts +++ b/src/core/config.ts @@ -1,6 +1,7 @@ import { readFile } from 'node:fs/promises'; import { Config, AgentConfig } from '../types.js'; import { createAdapter } from '../agents/adapter.js'; +import { MCP_ROOT_PLACEHOLDER } from '../sandbox/mcp.js'; export async function loadConfig(configPath: string): Promise { let raw: string; @@ -65,8 +66,6 @@ function validateExecutorPluginEntry(plugin: Record, prefix: st } } -const MCP_ROOT_PLACEHOLDER = '${MCP_ROOT}'; - function validateExecutorMcpServerEntry(server: Record, prefix: string): void { if (!server || typeof server !== 'object' || Array.isArray(server)) { throw new Error(`${prefix} must be an object`); diff --git a/src/sandbox/mcp.ts b/src/sandbox/mcp.ts new file mode 100644 index 0000000..1b6abe6 --- /dev/null +++ b/src/sandbox/mcp.ts @@ -0,0 +1,44 @@ +import type { ResolvedExecutorMcpServer } from '../types.js'; +import type { MicrosandboxClient } from './microsandbox.js'; +import { uploadDirToSandbox } from './scaffolding.js'; + +export const MCP_ROOT_PLACEHOLDER = '${MCP_ROOT}'; + +export function substituteMcpRoot(args: string[], destDir: string): string[] { + return args.map((a) => a.split(MCP_ROOT_PLACEHOLDER).join(destDir)); +} + +export interface ResolvedMcpServerArgs { + name: string; + command: string; + args: string[]; +} + +/** + * Upload sourced MCP server payloads in parallel and return per-server resolved + * args (with `${MCP_ROOT}` substituted for the sandbox install dir). Sourceless + * entries pass through with their original args. Each adapter then writes its + * CLI-specific MCP config from the result. + * + * `{ includeAll: true }` on the upload is load-bearing — an MCP server is a + * runnable artifact and needs its `node_modules`, which the default + * source-archive exclusion strips. + */ +export async function uploadMcpServerSources( + client: MicrosandboxClient, + mcpRoot: string, + servers: ResolvedExecutorMcpServer[], +): Promise { + return Promise.all(servers.map(async (server) => { + if (server.kind === 'sourceless') { + return { name: server.name, command: server.command, args: server.args }; + } + const destDir = `${mcpRoot}/${server.name}`; + await uploadDirToSandbox(client, server.hostDir, destDir, `mcp_${server.name}`, { includeAll: true }); + return { + name: server.name, + command: server.command, + args: substituteMcpRoot(server.args, destDir), + }; + })); +} diff --git a/src/sandbox/scaffolding.ts b/src/sandbox/scaffolding.ts index 18848f9..d33e03f 100644 --- a/src/sandbox/scaffolding.ts +++ b/src/sandbox/scaffolding.ts @@ -338,8 +338,7 @@ export async function resolveExecutorMcpServers( return Promise.all(servers.map(async (server): Promise => { if (server.type === undefined) { - // Sourceless — command/args used as-is. - return { name: server.name, command: server.command, args: server.args }; + return { kind: 'sourceless', name: server.name, command: server.command, args: server.args }; } const source: SourceConfig = server.type === 'local' ? { type: 'local', path: server.path } @@ -352,6 +351,7 @@ export async function resolveExecutorMcpServers( }; const hostDir = await resolveSource(source, { reposDir: cacheRepos }); return { + kind: 'sourced', name: server.name, command: server.command, args: server.args, diff --git a/src/types.ts b/src/types.ts index bc0c184..4543b65 100644 --- a/src/types.ts +++ b/src/types.ts @@ -235,17 +235,27 @@ export type ExecutorMcpServer = | CommandExecutorMcpServer; /** - * An ExecutorMcpServer after host-side resolution. The adapter receives this - * and decides how to wire it into the sandbox VM. `hostDir` is present only - * when the entry had a source (local/git); sourceless entries omit it. + * An ExecutorMcpServer after host-side resolution. Discriminated on `kind` so + * consumers narrow at compile time instead of probing an optional `hostDir`. + * `sourced` entries were `local`/`git` in the input and carry a host directory + * to upload; `sourceless` entries had no input `type` and run from `command` + * alone (e.g. `npx server-name`). */ -export interface ResolvedExecutorMcpServer { - name: string; - command: string; - args: string[]; - /** Absolute host path to the MCP server source — only when there was a source. */ - hostDir?: string; -} +export type ResolvedExecutorMcpServer = + | { + kind: 'sourced'; + name: string; + command: string; + args: string[]; + /** Absolute host path to the MCP server source. */ + hostDir: string; + } + | { + kind: 'sourceless'; + name: string; + command: string; + args: string[]; + }; export interface SecretConfig { /** Raw value or "$ENV_VAR" reference resolved from host environment. */ From 43ffabd02c6943d47026cf452bbe7cf69b41a141 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Thu, 21 May 2026 17:16:52 +1200 Subject: [PATCH 4/5] refactor(executor): split MCP payload upload off from getSourceArchive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per review: getSourceArchive / uploadDirToSandbox were quietly carrying two contracts via the includeAll flag — SDK source upload (strips node_modules, build outputs, large binaries) vs runnable MCP server payload (verbatim archive). Same function, two semantically different jobs. Confusing. - Revert the includeAll axis on getSourceArchive and uploadDirToSandbox; both are now strictly for SDK source uploads (and plugin installs). - Move MCP payload upload into a dedicated function pair in sandbox/mcp.ts: tarMcpServerSource (verbatim tar, separate process-scoped cache) and uploadMcpServerPayload (tar + upload + extract). MCP module owns its own mechanics rather than reaching into scaffolding's contract. - Add a focused unit test for substituteMcpRoot. - Adapter tests now mock uploadMcpServerSources at the import boundary (rather than the underlying payload uploader) since vitest partial mocks don't intercept intra-module references. Substitution behaviour is covered by the new mcp.test.ts; orchestration logic is covered at the adapter-test boundary. Type-check clean; 372/372 tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/agents/__tests__/claude.test.ts | 28 ++++++++--- src/agents/__tests__/codex.test.ts | 26 +++++++--- src/sandbox/__tests__/mcp.test.ts | 27 ++++++++++ src/sandbox/mcp.ts | 76 ++++++++++++++++++++++++++--- src/sandbox/scaffolding.ts | 43 ++++++---------- 5 files changed, 154 insertions(+), 46 deletions(-) create mode 100644 src/sandbox/__tests__/mcp.test.ts diff --git a/src/agents/__tests__/claude.test.ts b/src/agents/__tests__/claude.test.ts index 8a7eed5..df09e71 100644 --- a/src/agents/__tests__/claude.test.ts +++ b/src/agents/__tests__/claude.test.ts @@ -2,6 +2,7 @@ import { describe, it, expect, vi, beforeEach } from 'vitest'; import { access } from 'node:fs/promises'; import { spawnAgent, spawnInteractive } from '../spawn.js'; import { uploadDirToSandbox } from '../../sandbox/scaffolding.js'; +import { uploadMcpServerSources } from '../../sandbox/mcp.js'; import { ClaudeAdapter } from '../claude.js'; import { makeAgentResult } from '../../__tests__/helpers/fixtures.js'; import { makeMockSandboxClient } from '../../__tests__/helpers/mock-sandbox-client.js'; @@ -19,10 +20,15 @@ vi.mock('../../sandbox/scaffolding.js', () => ({ uploadDirToSandbox: vi.fn(), })); +vi.mock('../../sandbox/mcp.js', () => ({ + uploadMcpServerSources: vi.fn(), +})); + const mockSpawnAgent = vi.mocked(spawnAgent); const mockSpawnInteractive = vi.mocked(spawnInteractive); const mockAccess = vi.mocked(access); const mockUploadDir = vi.mocked(uploadDirToSandbox); +const mockUploadMcpSources = vi.mocked(uploadMcpServerSources); describe('ClaudeAdapter', () => { let adapter: ClaudeAdapter; @@ -195,26 +201,34 @@ describe('ClaudeAdapter', () => { expect(client.runCommand).not.toHaveBeenCalled(); }); - it('uploads sourced servers, substitutes ${MCP_ROOT}, and writes mcp-config.json', async () => { + it('delegates uploads to uploadMcpServerSources and renders its result into mcp-config.json', async () => { const client = makeMockSandboxClient(); client.runCommand.mockResolvedValue({ stdout: '/root', stderr: '', exitCode: 0 }); + mockUploadMcpSources.mockResolvedValue([ + { name: 'mine', command: 'node', args: ['/root/.mcp-servers/mine/server.js', '--flag'] }, + { name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); await adapter.installMcpServersInSandbox(client as any, [ { kind: 'sourced', name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js', '--flag'], hostDir: '/tmp/mine' }, { kind: 'sourceless', name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, ]); - // Sourced server is uploaded outside /workspace, verbatim (incl. node_modules). - expect(mockUploadDir).toHaveBeenCalledWith(client, '/tmp/mine', '/root/.mcp-servers/mine', 'mcp_mine', { includeAll: true }); - // Sourceless server is not uploaded. - expect(mockUploadDir).toHaveBeenCalledTimes(1); + // Adapter passes the right mcpRoot + server list to the dedicated MCP upload helper. + expect(mockUploadMcpSources).toHaveBeenCalledTimes(1); + expect(mockUploadMcpSources).toHaveBeenCalledWith(client, '/root/.mcp-servers', [ + { kind: 'sourced', name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js', '--flag'], hostDir: '/tmp/mine' }, + { kind: 'sourceless', name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); + // Source-only uploads (uploadDirToSandbox) are not used for MCP payloads. + expect(mockUploadDir).not.toHaveBeenCalled(); - // The base64-decoded MCP config JSON should reflect the substitution. + // The base64-decoded MCP config JSON reflects the resolved args returned by the helper. const writeCall = client.runCommand.mock.calls.find((c: any[]) => String(c[0]).includes('base64 -d')); expect(writeCall).toBeDefined(); const b64 = String(writeCall![0]).match(/printf %s '([A-Za-z0-9+/=]+)'/)![1]; const cfg = JSON.parse(Buffer.from(b64, 'base64').toString('utf-8')); - expect(cfg.mcpServers.mine.args).toEqual(['/root/.mcp-servers/mine/server.js', '--flag']); + expect(cfg.mcpServers.mine).toEqual({ command: 'node', args: ['/root/.mcp-servers/mine/server.js', '--flag'] }); expect(cfg.mcpServers.fs).toEqual({ command: 'npx', args: ['-y', 'server-filesystem'] }); // sandboxCommand emits --mcp-config, not --strict-mcp-config. diff --git a/src/agents/__tests__/codex.test.ts b/src/agents/__tests__/codex.test.ts index c3e7b78..9e5f4ce 100644 --- a/src/agents/__tests__/codex.test.ts +++ b/src/agents/__tests__/codex.test.ts @@ -2,6 +2,7 @@ import { describe, it, expect, vi, beforeEach } from 'vitest'; import { writeFile, readFile, rm, access, readdir, stat } from 'node:fs/promises'; import { spawnAgent, spawnInteractive } from '../spawn.js'; import { uploadDirToSandbox } from '../../sandbox/scaffolding.js'; +import { uploadMcpServerSources } from '../../sandbox/mcp.js'; import { CodexAdapter } from '../codex.js'; import { makeAgentResult } from '../../__tests__/helpers/fixtures.js'; import { makeMockSandboxClient } from '../../__tests__/helpers/mock-sandbox-client.js'; @@ -24,6 +25,10 @@ vi.mock('../../sandbox/scaffolding.js', () => ({ uploadDirToSandbox: vi.fn(), })); +vi.mock('../../sandbox/mcp.js', () => ({ + uploadMcpServerSources: vi.fn(), +})); + const mockSpawnAgent = vi.mocked(spawnAgent); const mockSpawnInteractive = vi.mocked(spawnInteractive); const mockWriteFile = vi.mocked(writeFile); @@ -33,6 +38,7 @@ const mockAccess = vi.mocked(access); const mockReaddir = vi.mocked(readdir); const mockStat = vi.mocked(stat); const mockUploadDir = vi.mocked(uploadDirToSandbox); +const mockUploadMcpSources = vi.mocked(uploadMcpServerSources); describe('CodexAdapter', () => { let adapter: CodexAdapter; @@ -238,21 +244,29 @@ describe('CodexAdapter', () => { expect(client.runCommand).not.toHaveBeenCalled(); }); - it('uploads sourced servers, substitutes ${MCP_ROOT}, and appends config.toml blocks', async () => { + it('delegates uploads to uploadMcpServerSources and renders its result into config.toml blocks', async () => { const client = makeMockSandboxClient(); client.runCommand.mockResolvedValue({ stdout: '/root/.codex', stderr: '', exitCode: 0 }); + mockUploadMcpSources.mockResolvedValue([ + { name: 'mine', command: 'node', args: ['/root/.codex/.mcp-servers/mine/server.js'] }, + { name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); await adapter.installMcpServersInSandbox(client as any, [ { kind: 'sourced', name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js'], hostDir: '/tmp/mine' }, { kind: 'sourceless', name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, ]); - expect(mockUploadDir).toHaveBeenCalledWith( - client, '/tmp/mine', '/root/.codex/.mcp-servers/mine', 'mcp_mine', { includeAll: true }, - ); - expect(mockUploadDir).toHaveBeenCalledTimes(1); + // Adapter passes the right mcpRoot + server list to the dedicated MCP upload helper. + expect(mockUploadMcpSources).toHaveBeenCalledTimes(1); + expect(mockUploadMcpSources).toHaveBeenCalledWith(client, '/root/.codex/.mcp-servers', [ + { kind: 'sourced', name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js'], hostDir: '/tmp/mine' }, + { kind: 'sourceless', name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); + // Source-only uploads (uploadDirToSandbox) are not used for MCP payloads. + expect(mockUploadDir).not.toHaveBeenCalled(); - // The base64-decoded TOML appended to config.toml reflects the substitution. + // The base64-decoded TOML appended to config.toml reflects the resolved args returned by the helper. const writeCall = client.runCommand.mock.calls.find((c: any[]) => String(c[0]).includes('config.toml')); expect(writeCall).toBeDefined(); const b64 = String(writeCall![0]).match(/printf %s '([A-Za-z0-9+/=]+)'/)![1]; diff --git a/src/sandbox/__tests__/mcp.test.ts b/src/sandbox/__tests__/mcp.test.ts new file mode 100644 index 0000000..ce68afd --- /dev/null +++ b/src/sandbox/__tests__/mcp.test.ts @@ -0,0 +1,27 @@ +import { describe, it, expect } from 'vitest'; +import { MCP_ROOT_PLACEHOLDER, substituteMcpRoot } from '../mcp.js'; + +describe('substituteMcpRoot', () => { + it('replaces every occurrence of the placeholder in each arg', () => { + expect(substituteMcpRoot( + ['${MCP_ROOT}/index.js', '--config', '${MCP_ROOT}/config.json'], + '/sandbox/.mcp-servers/mine', + )).toEqual([ + '/sandbox/.mcp-servers/mine/index.js', + '--config', + '/sandbox/.mcp-servers/mine/config.json', + ]); + }); + + it('leaves args without the placeholder untouched', () => { + expect(substituteMcpRoot(['-y', 'server-filesystem'], '/anywhere')).toEqual(['-y', 'server-filesystem']); + }); + + it('replaces multiple placeholders within a single arg', () => { + expect(substituteMcpRoot(['${MCP_ROOT}/a:${MCP_ROOT}/b'], '/x')).toEqual(['/x/a:/x/b']); + }); + + it('exports the placeholder literal that validators and adapters can share', () => { + expect(MCP_ROOT_PLACEHOLDER).toBe('${MCP_ROOT}'); + }); +}); diff --git a/src/sandbox/mcp.ts b/src/sandbox/mcp.ts index 1b6abe6..b30d74b 100644 --- a/src/sandbox/mcp.ts +++ b/src/sandbox/mcp.ts @@ -1,6 +1,9 @@ +import { readFile, stat as fsStat } from 'node:fs/promises'; +import { execFile } from 'node:child_process'; +import { basename, join } from 'node:path'; +import { tmpdir } from 'node:os'; import type { ResolvedExecutorMcpServer } from '../types.js'; import type { MicrosandboxClient } from './microsandbox.js'; -import { uploadDirToSandbox } from './scaffolding.js'; export const MCP_ROOT_PLACEHOLDER = '${MCP_ROOT}'; @@ -8,6 +11,71 @@ export function substituteMcpRoot(args: string[], destDir: string): string[] { return args.map((a) => a.split(MCP_ROOT_PLACEHOLDER).join(destDir)); } +/** + * Process-scoped cache: MCP server source path → verbatim-tarball path on disk. + * Kept separate from the SDK-source archive cache in `scaffolding.ts` because + * MCP server payloads are runnable artifacts (must include `node_modules` and + * any build output), while SDK-source archives strip those for code review. + */ +const mcpServerArchiveCache = new Map(); + +/** Create a verbatim tar.gz archive of an MCP server source directory. */ +async function tarMcpServerSource(srcPath: string): Promise { + const cached = mcpServerArchiveCache.get(srcPath); + if (cached) { + try { + await fsStat(cached); + return cached; + } catch { + mcpServerArchiveCache.delete(srcPath); + } + } + + const dirName = basename(srcPath); + const tarPath = join(tmpdir(), `agentic-mcp-server-${dirName}-${Date.now()}.tar.gz`); + + await new Promise((resolve, reject) => { + execFile('tar', ['czf', tarPath, '-C', srcPath, '.'], { + timeout: 300_000, + maxBuffer: 50 * 1024 * 1024, + }, (error) => { + if (error) reject(new Error(`Failed to archive MCP server source at ${srcPath}: ${error.message}`)); + else resolve(); + }); + }); + + mcpServerArchiveCache.set(srcPath, tarPath); + return tarPath; +} + +/** + * Tar an MCP server source directory on the host, upload it to the sandbox, + * and extract it into `sandboxDestDir`. Separate from `uploadDirToSandbox` + * because MCP servers are runnable artifacts: the archive must include + * `node_modules` (and any other "bloat" the SDK-source path strips) or the + * server crashes on spawn with ERR_MODULE_NOT_FOUND. + */ +export async function uploadMcpServerPayload( + client: MicrosandboxClient, + hostDir: string, + sandboxDestDir: string, + serverName: string, +): Promise { + const tarPath = await tarMcpServerSource(hostDir); + const tarData = await readFile(tarPath); + const sandboxTarPath = `/tmp/_mcp_${serverName}.tar.gz`; + await client.uploadBinaryFile(sandboxTarPath, tarData); + const result = await client.runCommand( + `mkdir -p '${sandboxDestDir}' && tar xzf '${sandboxTarPath}' -C '${sandboxDestDir}' && rm -f '${sandboxTarPath}'`, + ); + if (result.exitCode !== 0) { + throw new Error( + `Failed to extract MCP server '${serverName}' into sandbox at '${sandboxDestDir}': ` + + `${result.stderr || result.stdout}`, + ); + } +} + export interface ResolvedMcpServerArgs { name: string; command: string; @@ -19,10 +87,6 @@ export interface ResolvedMcpServerArgs { * args (with `${MCP_ROOT}` substituted for the sandbox install dir). Sourceless * entries pass through with their original args. Each adapter then writes its * CLI-specific MCP config from the result. - * - * `{ includeAll: true }` on the upload is load-bearing — an MCP server is a - * runnable artifact and needs its `node_modules`, which the default - * source-archive exclusion strips. */ export async function uploadMcpServerSources( client: MicrosandboxClient, @@ -34,7 +98,7 @@ export async function uploadMcpServerSources( return { name: server.name, command: server.command, args: server.args }; } const destDir = `${mcpRoot}/${server.name}`; - await uploadDirToSandbox(client, server.hostDir, destDir, `mcp_${server.name}`, { includeAll: true }); + await uploadMcpServerPayload(client, server.hostDir, destDir, server.name); return { name: server.name, command: server.command, diff --git a/src/sandbox/scaffolding.ts b/src/sandbox/scaffolding.ts index d33e03f..4e955e5 100644 --- a/src/sandbox/scaffolding.ts +++ b/src/sandbox/scaffolding.ts @@ -82,37 +82,29 @@ async function readDirRecursive( } /** - * Create a tar.gz archive of a directory. + * Create a tar.gz archive of an SDK *source* directory. Common bloat dirs + * (`node_modules`, `.git`, build outputs, …) and large binaries are stripped + * because the target is code review, not runtime use. For runnable artifacts + * (e.g. MCP server payloads) see `uploadMcpServerPayload` in `./mcp.ts`, which + * archives the tree verbatim instead. * - * By default, common bloat dirs (`node_modules`, `.git`, build outputs, …) are - * excluded — correct for SDK *source* uploads. Pass `{ includeAll: true }` to - * archive the tree verbatim: required for runnable artifacts like MCP server - * payloads, which need their `node_modules` to actually start. - * - * Archives are cached on disk so concurrent sandboxes reuse the same tarball; - * the cache key includes the `includeAll` flag so source/full archives of the - * same path don't collide. + * Archives are cached on disk so concurrent sandboxes reuse the same tarball. */ -export async function getSourceArchive( - srcPath: string, - opts?: { includeAll?: boolean }, -): Promise { - const includeAll = opts?.includeAll ?? false; - const cacheKey = `${srcPath}::${includeAll ? 'all' : 'src'}`; - const cached = sourceArchiveCache.get(cacheKey); +export async function getSourceArchive(srcPath: string): Promise { + const cached = sourceArchiveCache.get(srcPath); if (cached) { try { await fsStat(cached); return cached; } catch { - sourceArchiveCache.delete(cacheKey); + sourceArchiveCache.delete(srcPath); } } const dirName = basename(srcPath); const tarPath = join(tmpdir(), `agentic-sources-${dirName}-${Date.now()}.tar.gz`); - const excludeArgs = includeAll ? [] : [ + const excludeArgs = [ ...EXCLUDED_DIRS.flatMap((d) => ['--exclude', d]), ...EXCLUDED_EXTENSIONS.flatMap((ext) => ['--exclude', ext]), ]; @@ -127,29 +119,26 @@ export async function getSourceArchive( }); }); - sourceArchiveCache.set(cacheKey, tarPath); + sourceArchiveCache.set(srcPath, tarPath); return tarPath; } /** - * Tar a host directory, upload the archive to the sandbox, and extract it - * into `sandboxDestDir`. Used for any "copy this directory tree to a known - * path inside the VM" operation (source uploads, plugin installs). + * Tar an SDK *source* directory on the host, upload the archive, and extract + * it into `sandboxDestDir`. Used for source uploads and plugin installs — + * anything whose target is code review rather than runtime execution. For + * runnable artifacts see `uploadMcpServerPayload` in `./mcp.ts`. * * `archiveLabel` should be a slug-safe identifier (used only to name the * temporary tarball path inside the sandbox). - * - * Pass `{ includeAll: true }` to upload the tree verbatim (no `node_modules` - * exclusion) — required for runnable artifacts such as MCP server payloads. */ export async function uploadDirToSandbox( client: MicrosandboxClient, hostDir: string, sandboxDestDir: string, archiveLabel: string, - opts?: { includeAll?: boolean }, ): Promise { - const tarPath = await getSourceArchive(hostDir, opts); + const tarPath = await getSourceArchive(hostDir); const tarData = await readFile(tarPath); const sandboxTarPath = `/tmp/_${archiveLabel}.tar.gz`; await client.uploadBinaryFile(sandboxTarPath, tarData); From fc6689f62f69d6c36e7c9f3386003f6952fb9bb4 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Fri, 22 May 2026 06:09:34 +1200 Subject: [PATCH 5/5] fix(executor): normalise $HOME=/ in MCP install path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Smoke-testing against a node:20-slim microsandbox VM showed both MCP install paths producing //.mcp-servers/... and //.codex/.mcp-servers/... — because $HOME=/ in that image, which slips past the ${HOME:-/root} shell fallback (HOME is set, just degenerate). Catch it on the Node side: collapse repeated slashes, then treat a bare / (claude) or / or /.codex (codex, after the inner :- fallback appends /.codex to /) as "no real home" and fall back to /root or /root/.codex respectively. Linux collapses // at the syscall layer so installs technically still worked, but the paths showed up verbatim in mcp-config.json args and config.toml [mcp_servers.*] blocks, which is ugly and could confuse the MCP client. Scoped to the MCP install path only — the equivalent $HOME probe in the plugin install path (shipped via PR #3) has the same flaw and should be fixed in a follow-up. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/agents/__tests__/claude.test.ts | 21 +++++++++++++++++++++ src/agents/__tests__/codex.test.ts | 24 ++++++++++++++++++++++++ src/agents/claude.ts | 6 +++++- src/agents/codex.ts | 8 +++++++- 4 files changed, 57 insertions(+), 2 deletions(-) diff --git a/src/agents/__tests__/claude.test.ts b/src/agents/__tests__/claude.test.ts index df09e71..97e0b81 100644 --- a/src/agents/__tests__/claude.test.ts +++ b/src/agents/__tests__/claude.test.ts @@ -237,6 +237,27 @@ describe('ClaudeAdapter', () => { expect(cmd).not.toContain('--strict-mcp-config'); }); + it('falls back to /root when $HOME degenerates to /', async () => { + const client = makeMockSandboxClient(); + // First runCommand call is the $HOME probe; later calls are the config write. + client.runCommand.mockImplementation(async (cmd: string) => + cmd.includes('${HOME') + ? { stdout: '/', stderr: '', exitCode: 0 } + : { stdout: '', stderr: '', exitCode: 0 }, + ); + mockUploadMcpSources.mockResolvedValue([]); + + await adapter.installMcpServersInSandbox(client as any, [ + { kind: 'sourceless', name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); + + // Adapter should treat $HOME=/ as degenerate and use /root, not //.mcp-servers. + expect(mockUploadMcpSources).toHaveBeenCalledWith(client, '/root/.mcp-servers', expect.any(Array)); + const cmd = adapter.sandboxCommand('go'); + expect(cmd).toContain("--mcp-config '/root/.mcp-servers/mcp-config.json'"); + expect(cmd).not.toContain('//.mcp-servers'); + }); + it('sandboxCommand omits --mcp-config when no MCP servers were installed', () => { const fresh = new ClaudeAdapter({ command: 'claude' }); expect(fresh.sandboxCommand('go')).not.toContain('--mcp-config'); diff --git a/src/agents/__tests__/codex.test.ts b/src/agents/__tests__/codex.test.ts index 9e5f4ce..51c331e 100644 --- a/src/agents/__tests__/codex.test.ts +++ b/src/agents/__tests__/codex.test.ts @@ -279,5 +279,29 @@ describe('CodexAdapter', () => { // Append, not clobber. expect(String(writeCall![0])).toContain('>>'); }); + + it('falls back to /root/.codex when $HOME=/ leaks through as //.codex', async () => { + const client = makeMockSandboxClient(); + // What `printf %s "${CODEX_HOME:-${HOME:-/root}/.codex}"` produces when + // $HOME=/ and $CODEX_HOME is unset: literal "/" + literal "/.codex". + client.runCommand.mockImplementation(async (cmd: string) => + cmd.includes('CODEX_HOME') + ? { stdout: '//.codex', stderr: '', exitCode: 0 } + : { stdout: '', stderr: '', exitCode: 0 }, + ); + mockUploadMcpSources.mockResolvedValue([]); + + await adapter.installMcpServersInSandbox(client as any, [ + { kind: 'sourceless', name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); + + // Adapter should recognise /.codex as the $HOME=/ degenerate case and rewrite to /root/.codex. + expect(mockUploadMcpSources).toHaveBeenCalledWith(client, '/root/.codex/.mcp-servers', expect.any(Array)); + // The config write should target /root/.codex/config.toml, not /.codex/config.toml. + const writeCall = client.runCommand.mock.calls.find((c: any[]) => String(c[0]).includes('config.toml')); + expect(writeCall).toBeDefined(); + expect(String(writeCall![0])).toContain("'/root/.codex/config.toml'"); + expect(String(writeCall![0])).not.toContain("'/.codex/config.toml'"); + }); }); }); diff --git a/src/agents/claude.ts b/src/agents/claude.ts index 2fea940..2091af0 100644 --- a/src/agents/claude.ts +++ b/src/agents/claude.ts @@ -173,7 +173,11 @@ export class ClaudeAdapter extends BaseAdapter { if (servers.length === 0) return; const homeResult = await client.runCommand('printf %s "${HOME:-/root}"'); - const home = homeResult.stdout.trim() || '/root'; + // $HOME=/ slips past the ${HOME:-/root} shell fallback (it's set, just + // degenerate). Collapse repeated slashes so e.g. $HOME=/// also normalises, + // then treat a bare / as "no real home" and fall back to /root. + const normalizedHome = homeResult.stdout.trim().replace(/\/+/g, '/'); + const home = !normalizedHome || normalizedHome === '/' ? '/root' : normalizedHome; const mcpRoot = `${home}/.mcp-servers`; const resolved = await uploadMcpServerSources(client, mcpRoot, servers); diff --git a/src/agents/codex.ts b/src/agents/codex.ts index 0d80c4c..484664b 100644 --- a/src/agents/codex.ts +++ b/src/agents/codex.ts @@ -209,7 +209,13 @@ export class CodexAdapter extends BaseAdapter { if (servers.length === 0) return; const homeResult = await client.runCommand('printf %s "${CODEX_HOME:-${HOME:-/root}/.codex}"'); - const codexHome = homeResult.stdout.trim() || '/root/.codex'; + // $HOME=/ slips past the inner ${HOME:-/root} shell fallback and produces + // "//.codex" (literal "/" + literal "/.codex"). Collapse repeated slashes + // and then treat / or /.codex as "no real home" and fall back to /root/.codex. + const normalized = homeResult.stdout.trim().replace(/\/+/g, '/'); + const codexHome = !normalized || normalized === '/' || normalized === '/.codex' + ? '/root/.codex' + : normalized; const mcpRoot = `${codexHome}/.mcp-servers`; const resolved = await uploadMcpServerSources(client, mcpRoot, servers);