diff --git a/skills/_reference/config-schema.md b/skills/_reference/config-schema.md index 38735a0..5e218dc 100644 --- a/skills/_reference/config-schema.md +++ b/skills/_reference/config-schema.md @@ -10,6 +10,7 @@ | `targets` | `TargetConfig[]` | **Yes** | Non-empty array. Docker images for sandboxed execution. | | `workspace` | `WorkspaceConfig` | No | Workspace template and setup. | | `executorPlugins` | `ExecutorPlugin[]` | No | Plugin directories installed into the executor's agent CLI inside the sandbox (Claude marketplace, Codex skills, Gemini extensions). Not installed in the judge sandbox — that's intentional, so the judge stays independent of the executor's tooling. | +| `executorMcpServers` | `ExecutorMcpServer[]` | No | MCP servers wired into the executor's agent CLI inside the sandbox (Claude `--mcp-config`, Codex `config.toml`). A separate mechanism from `executorPlugins` — a plugin's bundled MCP servers are not loaded. Not installed in the judge sandbox. | | `sandbox` | `SandboxConfig` | **Yes** | Must be an object (can be `{}`). Resource limits, secrets, env vars. | ## SourceConfig (discriminated union on `type`) @@ -186,6 +187,63 @@ What an adapter requires inside the plugin directory: Each adapter fails fast at install time if its required file is missing — the A/B comparison won't silently no-op. +## ExecutorMcpServer (discriminated union on optional `type`) + +An MCP server wired into the executor's agent CLI. Independent of +`ExecutorPlugin` — MCP servers are wired through each CLI's native MCP-config +surface, NOT through agent plugins (a Claude plugin's `mcpServers` are not +loaded by `claude --plugin-dir`). Installed **only** in the executor sandbox; +the judge sandbox is kept MCP-free. + +Every entry shares three fields: + +| Field | Type | Required | +|-------|------|----------| +| `name` | `string` | Yes — server slug (letters/digits/`.`/`_`/`-`), unique across the array. Used as the MCP server name and install dir name. | +| `command` | `string` | Yes — executable that launches the MCP server (e.g. `node`, `npx`). | +| `args` | `string[]` | Yes — args to `command`. May contain the literal `${MCP_ROOT}` placeholder when a source is present. | + +The optional `type` discriminator selects the source shape: + +### CommandExecutorMcpServer (no `type`) + +No source — `command`/`args` are used as-is (e.g. a server launched via `npx`). +`${MCP_ROOT}` must NOT appear in `args` (there is nothing to substitute) — the +config validator rejects it. + +### LocalExecutorMcpServer (`type: "local"`) + +| Field | Type | Required | +|-------|------|----------| +| `type` | `"local"` | Yes | +| `path` | `string` | Yes — host directory tree uploaded into the sandbox | + +### GitExecutorMcpServer (`type: "git"`) + +| Field | Type | Required | +|-------|------|----------| +| `type` | `"git"` | Yes | +| `url` | `string` | Yes — git repository URL | +| `branch` | `string` | No | +| `subpath` | `string` | No — path within the repo to the server source | +| `sparse` | `string[]` | No — sparse checkout paths | + +### `${MCP_ROOT}` substitution + +For sourced servers (`local`/`git`), the source directory is uploaded into the +sandbox and the literal string `${MCP_ROOT}` in each `args` entry is replaced +with that server's absolute sandbox install dir at install time. Sourceless +servers cannot use the placeholder. + +### Per-adapter wiring + +| Adapter | Sandbox destination | Wiring | +|---|---|---| +| `claude` | sourced servers extracted to `$HOME/.mcp-servers//` (outside `/workspace`) | combined `$HOME/.mcp-servers/mcp-config.json` passed via the `--mcp-config` flag (no `--strict-mcp-config`) | +| `codex` | sourced servers extracted to `$CODEX_HOME/.mcp-servers//` | `[mcp_servers.]` block appended to `$CODEX_HOME/config.toml` (auto-read; existing content preserved) | +| `gemini` | — | Not supported in non-interactive mode. Adapter throws a clear error if `executorMcpServers` is non-empty. | +| custom | — | Not supported. Adapter throws a clear error if `executorMcpServers` is non-empty. | + ## Validation Rules 1. Root must be a JSON object @@ -198,6 +256,7 @@ A/B comparison won't silently no-op. 8. Custom agents must provide `envVar` and `baseUrl` in their secret 9. `baseUrl` must be a parseable URL 10. `executorPlugins`, if present, must be an array; each entry needs a `name` (slug-safe) and a valid `type` (`local` or `git`); names must be unique +11. `executorMcpServers`, if present, must be an array; each entry needs a slug-safe unique `name`, a string `command`, and a `string[]` `args`; `type` is optional but when present must be `local` (needs `path`) or `git` (needs `url`); when `type` is absent, `${MCP_ROOT}` must not appear in `args` ## Minimal Examples diff --git a/src/agents/__tests__/claude.test.ts b/src/agents/__tests__/claude.test.ts index 3d51a8a..97e0b81 100644 --- a/src/agents/__tests__/claude.test.ts +++ b/src/agents/__tests__/claude.test.ts @@ -2,6 +2,7 @@ import { describe, it, expect, vi, beforeEach } from 'vitest'; import { access } from 'node:fs/promises'; import { spawnAgent, spawnInteractive } from '../spawn.js'; import { uploadDirToSandbox } from '../../sandbox/scaffolding.js'; +import { uploadMcpServerSources } from '../../sandbox/mcp.js'; import { ClaudeAdapter } from '../claude.js'; import { makeAgentResult } from '../../__tests__/helpers/fixtures.js'; import { makeMockSandboxClient } from '../../__tests__/helpers/mock-sandbox-client.js'; @@ -19,10 +20,15 @@ vi.mock('../../sandbox/scaffolding.js', () => ({ uploadDirToSandbox: vi.fn(), })); +vi.mock('../../sandbox/mcp.js', () => ({ + uploadMcpServerSources: vi.fn(), +})); + const mockSpawnAgent = vi.mocked(spawnAgent); const mockSpawnInteractive = vi.mocked(spawnInteractive); const mockAccess = vi.mocked(access); const mockUploadDir = vi.mocked(uploadDirToSandbox); +const mockUploadMcpSources = vi.mocked(uploadMcpServerSources); describe('ClaudeAdapter', () => { let adapter: ClaudeAdapter; @@ -187,4 +193,74 @@ describe('ClaudeAdapter', () => { expect(cmd).not.toContain('--plugin-dir'); }); }); + + describe('installMcpServersInSandbox', () => { + it('is a no-op when given an empty server list', async () => { + const client = makeMockSandboxClient(); + await adapter.installMcpServersInSandbox(client as any, []); + expect(client.runCommand).not.toHaveBeenCalled(); + }); + + it('delegates uploads to uploadMcpServerSources and renders its result into mcp-config.json', async () => { + const client = makeMockSandboxClient(); + client.runCommand.mockResolvedValue({ stdout: '/root', stderr: '', exitCode: 0 }); + mockUploadMcpSources.mockResolvedValue([ + { name: 'mine', command: 'node', args: ['/root/.mcp-servers/mine/server.js', '--flag'] }, + { name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); + + await adapter.installMcpServersInSandbox(client as any, [ + { kind: 'sourced', name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js', '--flag'], hostDir: '/tmp/mine' }, + { kind: 'sourceless', name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); + + // Adapter passes the right mcpRoot + server list to the dedicated MCP upload helper. + expect(mockUploadMcpSources).toHaveBeenCalledTimes(1); + expect(mockUploadMcpSources).toHaveBeenCalledWith(client, '/root/.mcp-servers', [ + { kind: 'sourced', name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js', '--flag'], hostDir: '/tmp/mine' }, + { kind: 'sourceless', name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); + // Source-only uploads (uploadDirToSandbox) are not used for MCP payloads. + expect(mockUploadDir).not.toHaveBeenCalled(); + + // The base64-decoded MCP config JSON reflects the resolved args returned by the helper. + const writeCall = client.runCommand.mock.calls.find((c: any[]) => String(c[0]).includes('base64 -d')); + expect(writeCall).toBeDefined(); + const b64 = String(writeCall![0]).match(/printf %s '([A-Za-z0-9+/=]+)'/)![1]; + const cfg = JSON.parse(Buffer.from(b64, 'base64').toString('utf-8')); + expect(cfg.mcpServers.mine).toEqual({ command: 'node', args: ['/root/.mcp-servers/mine/server.js', '--flag'] }); + expect(cfg.mcpServers.fs).toEqual({ command: 'npx', args: ['-y', 'server-filesystem'] }); + + // sandboxCommand emits --mcp-config, not --strict-mcp-config. + const cmd = adapter.sandboxCommand('do the thing'); + expect(cmd).toContain("--mcp-config '/root/.mcp-servers/mcp-config.json'"); + expect(cmd).not.toContain('--strict-mcp-config'); + }); + + it('falls back to /root when $HOME degenerates to /', async () => { + const client = makeMockSandboxClient(); + // First runCommand call is the $HOME probe; later calls are the config write. + client.runCommand.mockImplementation(async (cmd: string) => + cmd.includes('${HOME') + ? { stdout: '/', stderr: '', exitCode: 0 } + : { stdout: '', stderr: '', exitCode: 0 }, + ); + mockUploadMcpSources.mockResolvedValue([]); + + await adapter.installMcpServersInSandbox(client as any, [ + { kind: 'sourceless', name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); + + // Adapter should treat $HOME=/ as degenerate and use /root, not //.mcp-servers. + expect(mockUploadMcpSources).toHaveBeenCalledWith(client, '/root/.mcp-servers', expect.any(Array)); + const cmd = adapter.sandboxCommand('go'); + expect(cmd).toContain("--mcp-config '/root/.mcp-servers/mcp-config.json'"); + expect(cmd).not.toContain('//.mcp-servers'); + }); + + it('sandboxCommand omits --mcp-config when no MCP servers were installed', () => { + const fresh = new ClaudeAdapter({ command: 'claude' }); + expect(fresh.sandboxCommand('go')).not.toContain('--mcp-config'); + }); + }); }); diff --git a/src/agents/__tests__/codex.test.ts b/src/agents/__tests__/codex.test.ts index b42b4f6..51c331e 100644 --- a/src/agents/__tests__/codex.test.ts +++ b/src/agents/__tests__/codex.test.ts @@ -2,6 +2,7 @@ import { describe, it, expect, vi, beforeEach } from 'vitest'; import { writeFile, readFile, rm, access, readdir, stat } from 'node:fs/promises'; import { spawnAgent, spawnInteractive } from '../spawn.js'; import { uploadDirToSandbox } from '../../sandbox/scaffolding.js'; +import { uploadMcpServerSources } from '../../sandbox/mcp.js'; import { CodexAdapter } from '../codex.js'; import { makeAgentResult } from '../../__tests__/helpers/fixtures.js'; import { makeMockSandboxClient } from '../../__tests__/helpers/mock-sandbox-client.js'; @@ -24,6 +25,10 @@ vi.mock('../../sandbox/scaffolding.js', () => ({ uploadDirToSandbox: vi.fn(), })); +vi.mock('../../sandbox/mcp.js', () => ({ + uploadMcpServerSources: vi.fn(), +})); + const mockSpawnAgent = vi.mocked(spawnAgent); const mockSpawnInteractive = vi.mocked(spawnInteractive); const mockWriteFile = vi.mocked(writeFile); @@ -33,6 +38,7 @@ const mockAccess = vi.mocked(access); const mockReaddir = vi.mocked(readdir); const mockStat = vi.mocked(stat); const mockUploadDir = vi.mocked(uploadDirToSandbox); +const mockUploadMcpSources = vi.mocked(uploadMcpServerSources); describe('CodexAdapter', () => { let adapter: CodexAdapter; @@ -230,4 +236,72 @@ describe('CodexAdapter', () => { ])).rejects.toThrow(/'plugin-a'.*'plugin-b'/); }); }); + + describe('installMcpServersInSandbox', () => { + it('is a no-op when given an empty server list', async () => { + const client = makeMockSandboxClient(); + await adapter.installMcpServersInSandbox(client as any, []); + expect(client.runCommand).not.toHaveBeenCalled(); + }); + + it('delegates uploads to uploadMcpServerSources and renders its result into config.toml blocks', async () => { + const client = makeMockSandboxClient(); + client.runCommand.mockResolvedValue({ stdout: '/root/.codex', stderr: '', exitCode: 0 }); + mockUploadMcpSources.mockResolvedValue([ + { name: 'mine', command: 'node', args: ['/root/.codex/.mcp-servers/mine/server.js'] }, + { name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); + + await adapter.installMcpServersInSandbox(client as any, [ + { kind: 'sourced', name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js'], hostDir: '/tmp/mine' }, + { kind: 'sourceless', name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); + + // Adapter passes the right mcpRoot + server list to the dedicated MCP upload helper. + expect(mockUploadMcpSources).toHaveBeenCalledTimes(1); + expect(mockUploadMcpSources).toHaveBeenCalledWith(client, '/root/.codex/.mcp-servers', [ + { kind: 'sourced', name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js'], hostDir: '/tmp/mine' }, + { kind: 'sourceless', name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); + // Source-only uploads (uploadDirToSandbox) are not used for MCP payloads. + expect(mockUploadDir).not.toHaveBeenCalled(); + + // The base64-decoded TOML appended to config.toml reflects the resolved args returned by the helper. + const writeCall = client.runCommand.mock.calls.find((c: any[]) => String(c[0]).includes('config.toml')); + expect(writeCall).toBeDefined(); + const b64 = String(writeCall![0]).match(/printf %s '([A-Za-z0-9+/=]+)'/)![1]; + const toml = Buffer.from(b64, 'base64').toString('utf-8'); + expect(toml).toContain('[mcp_servers.mine]'); + expect(toml).toContain('command = "node"'); + expect(toml).toContain('args = ["/root/.codex/.mcp-servers/mine/server.js"]'); + expect(toml).toContain('[mcp_servers.fs]'); + expect(toml).toContain('args = ["-y", "server-filesystem"]'); + // Append, not clobber. + expect(String(writeCall![0])).toContain('>>'); + }); + + it('falls back to /root/.codex when $HOME=/ leaks through as //.codex', async () => { + const client = makeMockSandboxClient(); + // What `printf %s "${CODEX_HOME:-${HOME:-/root}/.codex}"` produces when + // $HOME=/ and $CODEX_HOME is unset: literal "/" + literal "/.codex". + client.runCommand.mockImplementation(async (cmd: string) => + cmd.includes('CODEX_HOME') + ? { stdout: '//.codex', stderr: '', exitCode: 0 } + : { stdout: '', stderr: '', exitCode: 0 }, + ); + mockUploadMcpSources.mockResolvedValue([]); + + await adapter.installMcpServersInSandbox(client as any, [ + { kind: 'sourceless', name: 'fs', command: 'npx', args: ['-y', 'server-filesystem'] }, + ]); + + // Adapter should recognise /.codex as the $HOME=/ degenerate case and rewrite to /root/.codex. + expect(mockUploadMcpSources).toHaveBeenCalledWith(client, '/root/.codex/.mcp-servers', expect.any(Array)); + // The config write should target /root/.codex/config.toml, not /.codex/config.toml. + const writeCall = client.runCommand.mock.calls.find((c: any[]) => String(c[0]).includes('config.toml')); + expect(writeCall).toBeDefined(); + expect(String(writeCall![0])).toContain("'/root/.codex/config.toml'"); + expect(String(writeCall![0])).not.toContain("'/.codex/config.toml'"); + }); + }); }); diff --git a/src/agents/adapter.ts b/src/agents/adapter.ts index c74e88c..295f826 100644 --- a/src/agents/adapter.ts +++ b/src/agents/adapter.ts @@ -1,4 +1,4 @@ -import { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; +import { AgentConfig, AgentResult, ResolvedExecutorPlugin, ResolvedExecutorMcpServer } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { ClaudeAdapter } from './claude.js'; import { CodexAdapter } from './codex.js'; @@ -43,6 +43,16 @@ export interface AgentAdapter { * clear error here instead of silently succeeding. */ installPluginsInSandbox(client: MicrosandboxClient, plugins: ResolvedExecutorPlugin[]): Promise; + + /** + * Install MCP servers into the running sandbox so the agent CLI connects to + * them at startup. Parallel to `installPluginsInSandbox` but a separate + * mechanism — each adapter wires servers through its CLI's native MCP-config + * surface (Claude: `--mcp-config`, Codex: `[mcp_servers.*]` in `config.toml`). + * Adapters whose CLI cannot load MCP servers in non-interactive mode raise a + * clear error here instead of silently succeeding. + */ + installMcpServersInSandbox(client: MicrosandboxClient, servers: ResolvedExecutorMcpServer[]): Promise; } const KNOWN_ADAPTERS: Record AgentAdapter> = { diff --git a/src/agents/base.ts b/src/agents/base.ts index aa141a2..29d51c6 100644 --- a/src/agents/base.ts +++ b/src/agents/base.ts @@ -1,4 +1,4 @@ -import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin, ResolvedExecutorMcpServer } from '../types.js'; import type { AgentAdapter } from './adapter.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { spawnAgent, spawnInteractive } from './spawn.js'; @@ -102,6 +102,23 @@ export abstract class BaseAdapter implements AgentAdapter { ); } + /** + * Install MCP servers into the sandbox VM. Subclasses override with + * CLI-specific wiring (Claude: `--mcp-config`, Codex: `config.toml`). + * Default raises a clear error so adapters that don't support MCP servers + * fail loudly when the user wires `executorMcpServers` against them. + */ + async installMcpServersInSandbox( + _client: MicrosandboxClient, + servers: ResolvedExecutorMcpServer[], + ): Promise { + if (servers.length === 0) return; + throw new Error( + `Agent adapter '${this.name}' does not support executorMcpServers. ` + + `Either remove executorMcpServers from config or switch executor to an adapter that supports MCP servers.`, + ); + } + /** Shared helper: spawn the agent process with piped stdio. */ protected spawn(args: string[], workDir: string, env?: Record, timeout?: number, stdin?: string): Promise { return spawnAgent(this.config.command, args, { cwd: workDir, env, timeout, stdin }); diff --git a/src/agents/claude.ts b/src/agents/claude.ts index 47d77fc..2091af0 100644 --- a/src/agents/claude.ts +++ b/src/agents/claude.ts @@ -1,8 +1,9 @@ import { access } from 'node:fs/promises'; import { join } from 'node:path'; -import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin, ResolvedExecutorMcpServer } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { uploadDirToSandbox } from '../sandbox/scaffolding.js'; +import { uploadMcpServerSources } from '../sandbox/mcp.js'; import { BaseAdapter } from './base.js'; export class ClaudeAdapter extends BaseAdapter { @@ -19,6 +20,13 @@ export class ClaudeAdapter extends BaseAdapter { */ private installedPluginDirs: string[] = []; + /** + * Sandbox path of the MCP config JSON written by `installMcpServersInSandbox()`. + * `sandboxCommand()` reads this to emit a `--mcp-config ` flag so the + * Claude CLI connects to each MCP server for the run. Null until install. + */ + private installedMcpConfigPath: string | null = null; + constructor(config: AgentConfig) { super(config); } @@ -36,7 +44,13 @@ export class ClaudeAdapter extends BaseAdapter { const pluginFlags = this.installedPluginDirs .map((dir) => ` --plugin-dir '${dir}'`) .join(''); - const cmd = `cd ${workDir} && IS_SANDBOX=1 claude --print --dangerously-skip-permissions${pluginFlags} ${args.join(' ')} '${escaped}'${schemaFlags}`.trimEnd(); + // --mcp-config points the CLI at a JSON file of MCP servers to connect for + // the session. We intentionally do NOT pass --strict-mcp-config so the + // agent keeps any ambient MCP config alongside our injected servers. + const mcpFlag = this.installedMcpConfigPath + ? ` --mcp-config '${this.installedMcpConfigPath}'` + : ''; + const cmd = `cd ${workDir} && IS_SANDBOX=1 claude --print --dangerously-skip-permissions${pluginFlags}${mcpFlag} ${args.join(' ')} '${escaped}'${schemaFlags}`.trimEnd(); return cmd; } @@ -143,6 +157,52 @@ export class ClaudeAdapter extends BaseAdapter { })); } + /** + * Install MCP servers for the executor's Claude CLI session. + * + * Sourced servers are uploaded under `$HOME/.mcp-servers//` (outside + * `/workspace` so the agent's workspace stays clean). `${MCP_ROOT}` in each + * server's args is substituted with that server's sandbox install dir. A + * combined MCP config JSON is written to `$HOME/.mcp-servers/mcp-config.json` + * and surfaced to `sandboxCommand()` as a `--mcp-config` flag. + */ + async installMcpServersInSandbox( + client: MicrosandboxClient, + servers: ResolvedExecutorMcpServer[], + ): Promise { + if (servers.length === 0) return; + + const homeResult = await client.runCommand('printf %s "${HOME:-/root}"'); + // $HOME=/ slips past the ${HOME:-/root} shell fallback (it's set, just + // degenerate). Collapse repeated slashes so e.g. $HOME=/// also normalises, + // then treat a bare / as "no real home" and fall back to /root. + const normalizedHome = homeResult.stdout.trim().replace(/\/+/g, '/'); + const home = !normalizedHome || normalizedHome === '/' ? '/root' : normalizedHome; + const mcpRoot = `${home}/.mcp-servers`; + + const resolved = await uploadMcpServerSources(client, mcpRoot, servers); + + const mcpServers: Record = {}; + for (const server of resolved) { + mcpServers[server.name] = { command: server.command, args: server.args }; + } + + const configJson = JSON.stringify({ mcpServers }, null, 2); + const configPath = `${mcpRoot}/mcp-config.json`; + const encoded = Buffer.from(configJson, 'utf-8').toString('base64'); + const writeResult = await client.runCommand( + `mkdir -p '${mcpRoot}' && printf %s '${encoded}' | base64 -d > '${configPath}'`, + ); + if (writeResult.exitCode !== 0) { + throw new Error( + `Failed to write MCP config into sandbox at '${configPath}': ` + + `${writeResult.stderr || writeResult.stdout}`, + ); + } + + this.installedMcpConfigPath = configPath; + } + protected parseEnvelope(result: AgentResult): AgentResult | null { try { const envelope = JSON.parse(result.stdout); diff --git a/src/agents/codex.ts b/src/agents/codex.ts index 0997e72..484664b 100644 --- a/src/agents/codex.ts +++ b/src/agents/codex.ts @@ -1,9 +1,10 @@ import { writeFile, readFile, rm, access, readdir, stat } from 'node:fs/promises'; import { join } from 'node:path'; import { tmpdir } from 'node:os'; -import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin, ResolvedExecutorMcpServer } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { uploadDirToSandbox } from '../sandbox/scaffolding.js'; +import { uploadMcpServerSources } from '../sandbox/mcp.js'; import { BaseAdapter } from './base.js'; export class CodexAdapter extends BaseAdapter { @@ -192,6 +193,51 @@ export class CodexAdapter extends BaseAdapter { )); } + /** + * Install MCP servers for the executor's Codex CLI session. + * + * Sourced servers are uploaded under `$CODEX_HOME/.mcp-servers//`. + * `${MCP_ROOT}` in each server's args is substituted with that server's + * sandbox install dir. An `[mcp_servers.]` TOML block (command + args) + * is appended to `$CODEX_HOME/config.toml`, which Codex auto-reads. Existing + * config content is preserved — blocks are appended, never clobbered. + */ + async installMcpServersInSandbox( + client: MicrosandboxClient, + servers: ResolvedExecutorMcpServer[], + ): Promise { + if (servers.length === 0) return; + + const homeResult = await client.runCommand('printf %s "${CODEX_HOME:-${HOME:-/root}/.codex}"'); + // $HOME=/ slips past the inner ${HOME:-/root} shell fallback and produces + // "//.codex" (literal "/" + literal "/.codex"). Collapse repeated slashes + // and then treat / or /.codex as "no real home" and fall back to /root/.codex. + const normalized = homeResult.stdout.trim().replace(/\/+/g, '/'); + const codexHome = !normalized || normalized === '/' || normalized === '/.codex' + ? '/root/.codex' + : normalized; + const mcpRoot = `${codexHome}/.mcp-servers`; + + const resolved = await uploadMcpServerSources(client, mcpRoot, servers); + + const tomlBlocks = resolved.map((server) => + renderMcpServerToml(server.name, server.command, server.args), + ); + + const configPath = `${codexHome}/config.toml`; + const tomlText = `\n${tomlBlocks.join('\n')}\n`; + const encoded = Buffer.from(tomlText, 'utf-8').toString('base64'); + const writeResult = await client.runCommand( + `mkdir -p '${codexHome}' && touch '${configPath}' && printf %s '${encoded}' | base64 -d >> '${configPath}'`, + ); + if (writeResult.exitCode !== 0) { + throw new Error( + `Failed to append MCP servers to Codex config at '${configPath}': ` + + `${writeResult.stderr || writeResult.stdout}`, + ); + } + } + async extractLog(client: MicrosandboxClient): Promise { const result = await client.runCommand( "find / -path '*/.codex/sessions/*.jsonl' -type f 2>/dev/null | sort | tail -1", @@ -216,3 +262,24 @@ export class CodexAdapter extends BaseAdapter { } } } + +/** Encode a JS string as a TOML basic string (double-quoted, escaped). */ +function tomlString(value: string): string { + const escaped = value + .replace(/\\/g, '\\\\') + .replace(/"/g, '\\"') + .replace(/\n/g, '\\n') + .replace(/\t/g, '\\t') + .replace(/\r/g, '\\r'); + return `"${escaped}"`; +} + +/** Render an `[mcp_servers.]` TOML block with command + args. */ +function renderMcpServerToml(name: string, command: string, args: string[]): string { + const argsArray = args.map(tomlString).join(', '); + return [ + `[mcp_servers.${name}]`, + `command = ${tomlString(command)}`, + `args = [${argsArray}]`, + ].join('\n'); +} diff --git a/src/agents/gemini.ts b/src/agents/gemini.ts index 03d3dcc..db3bded 100644 --- a/src/agents/gemini.ts +++ b/src/agents/gemini.ts @@ -1,6 +1,6 @@ import { access } from 'node:fs/promises'; import { join } from 'node:path'; -import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin, ResolvedExecutorMcpServer } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { uploadDirToSandbox } from '../sandbox/scaffolding.js'; import { BaseAdapter } from './base.js'; @@ -90,6 +90,23 @@ export class GeminiAdapter extends BaseAdapter { )); } + /** + * Gemini CLI loads MCP servers from `settings.json`, but in non-interactive + * (`--yolo -p`) mode it cannot complete the per-server trust handshake the + * CLI requires, so injected MCP servers would not be reliably connected for + * the run. We fail loudly rather than silently no-op an A/B comparison. + */ + async installMcpServersInSandbox( + _client: MicrosandboxClient, + servers: ResolvedExecutorMcpServer[], + ): Promise { + if (servers.length === 0) return; + throw new Error( + `Agent adapter 'gemini' does not support executorMcpServers in non-interactive mode. ` + + `Remove executorMcpServers from config or switch the executor to an adapter that does (claude, codex).`, + ); + } + async extractLog(client: MicrosandboxClient): Promise { const result = await client.runCommand( "find / -path '*/.gemini/tmp/*/chats/session-*.jsonl' -type f 2>/dev/null | sort | tail -1", diff --git a/src/commands/execute.ts b/src/commands/execute.ts index a327c57..1bef73b 100644 --- a/src/commands/execute.ts +++ b/src/commands/execute.ts @@ -5,7 +5,7 @@ import { loadConfig } from '../core/config.js'; import { loadTestSuite, saveResult, saveBinaryResult, formatElapsed } from '../core/suite-io.js'; import { MicrosandboxClient, buildSecrets, applyAgentAuth, resolveEnv, type CommandResult } from '../sandbox/microsandbox.js'; import { createEgressLogger } from '../sandbox/egress-logger.js'; -import { scaffoldWorkspace, resolveExecutorPlugins } from '../sandbox/scaffolding.js'; +import { scaffoldWorkspace, resolveExecutorPlugins, resolveExecutorMcpServers } from '../sandbox/scaffolding.js'; import { WorkerPool } from '../sandbox/worker-pool.js'; import { createAdapter } from '../agents/adapter.js'; import { getPackageSource, getUrlSources, getFileSources } from '../types.js'; @@ -198,17 +198,38 @@ export async function executeTestCase( } } - // Plugins are intentionally executor-only — the judge sandbox stays - // plugin-free so its scoring is independent of the executor's tooling. + // Plugins and MCP servers are both executor-only — the judge sandbox stays + // free of either so its scoring is independent of the executor's tooling. + // Run the two install blocks concurrently: they write to disjoint sandbox + // dirs (plugins under the CLI's plugin dir; MCP servers under .mcp-servers) + // and update disjoint adapter state. + const installTasks: Promise[] = []; if (config.executorPlugins && config.executorPlugins.length > 0) { - const resolvedPlugins = await resolveExecutorPlugins(config.executorPlugins, paths.cacheRepos); - try { - await adapter.installPluginsInSandbox(client, resolvedPlugins); - } catch (err) { - const message = err instanceof Error ? err.message : String(err); - await saveResult(paths, testCase.id, 'plugin-install-error.log', message, target.name); - throw new Error(`Executor plugin install failed: ${message}`); - } + installTasks.push((async () => { + const resolvedPlugins = await resolveExecutorPlugins(config.executorPlugins, paths.cacheRepos); + try { + await adapter.installPluginsInSandbox(client, resolvedPlugins); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + await saveResult(paths, testCase.id, 'plugin-install-error.log', message, target.name); + throw new Error(`Executor plugin install failed: ${message}`); + } + })()); + } + if (config.executorMcpServers && config.executorMcpServers.length > 0) { + installTasks.push((async () => { + const resolvedMcpServers = await resolveExecutorMcpServers(config.executorMcpServers, paths.cacheRepos); + try { + await adapter.installMcpServersInSandbox(client, resolvedMcpServers); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + await saveResult(paths, testCase.id, 'mcp-server-install-error.log', message, target.name); + throw new Error(`Executor MCP server install failed: ${message}`); + } + })()); + } + if (installTasks.length > 0) { + await Promise.all(installTasks); } // Upload public file sources (docs directories, etc.) for the executor diff --git a/src/core/__tests__/config.test.ts b/src/core/__tests__/config.test.ts index 0efe953..5529099 100644 --- a/src/core/__tests__/config.test.ts +++ b/src/core/__tests__/config.test.ts @@ -325,4 +325,116 @@ describe('loadConfig', () => { await expect(loadConfig('/fake/config.json')).rejects.toThrow(/duplicated/); }); }); + + describe('executorMcpServers', () => { + it('accepts a sourceless server (no type)', async () => { + const config = { + ...validConfig, + executorMcpServers: [ + { name: 'fs', command: 'npx', args: ['-y', '@modelcontextprotocol/server-filesystem', '/workspace'] }, + ], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + const result = await loadConfig('/fake/config.json'); + expect(result.executorMcpServers).toHaveLength(1); + expect(result.executorMcpServers?.[0]).toMatchObject({ name: 'fs', command: 'npx' }); + }); + + it('accepts a local server with ${MCP_ROOT} in args', async () => { + const config = { + ...validConfig, + executorMcpServers: [ + { type: 'local', name: 'mine', command: 'node', args: ['${MCP_ROOT}/server.js'], path: '/tmp/mine' }, + ], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + const result = await loadConfig('/fake/config.json'); + expect(result.executorMcpServers).toHaveLength(1); + }); + + it('accepts a git server with branch + subpath', async () => { + const config = { + ...validConfig, + executorMcpServers: [ + { type: 'git', name: 'shared', command: 'node', args: ['${MCP_ROOT}/index.js'], + url: 'https://example.com/repo.git', branch: 'main', subpath: 'mcp' }, + ], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + const result = await loadConfig('/fake/config.json'); + expect(result.executorMcpServers).toHaveLength(1); + }); + + it('throws when executorMcpServers is not an array', async () => { + const config = { ...validConfig, executorMcpServers: 'not-an-array' }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/executorMcpServers must be an array/); + }); + + it('throws when a server has an invalid name', async () => { + const config = { + ...validConfig, + executorMcpServers: [{ name: 'bad name; rm -rf /', command: 'node', args: [] }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/unsupported characters/); + }); + + it('throws when a server is missing command', async () => { + const config = { + ...validConfig, + executorMcpServers: [{ name: 'a', args: [] }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/command/); + }); + + it('throws when a server has non-array args', async () => { + const config = { + ...validConfig, + executorMcpServers: [{ name: 'a', command: 'node', args: 'oops' }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/args/); + }); + + it('throws when ${MCP_ROOT} is used without a source', async () => { + const config = { + ...validConfig, + executorMcpServers: [{ name: 'a', command: 'node', args: ['${MCP_ROOT}/server.js'] }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/MCP_ROOT/); + }); + + it('throws when a local server is missing path', async () => { + const config = { + ...validConfig, + executorMcpServers: [{ type: 'local', name: 'a', command: 'node', args: [] }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/path/); + }); + + it('throws when a server has an invalid type', async () => { + const config = { + ...validConfig, + executorMcpServers: [{ type: 'url', name: 'a', command: 'node', args: [] }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/invalid/i); + }); + + it('throws when two servers share the same name', async () => { + const config = { + ...validConfig, + executorMcpServers: [ + { name: 'a', command: 'node', args: [] }, + { name: 'a', command: 'npx', args: [] }, + ], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/duplicated/); + }); + }); }); diff --git a/src/core/config.ts b/src/core/config.ts index cb03cc6..3976717 100644 --- a/src/core/config.ts +++ b/src/core/config.ts @@ -1,6 +1,7 @@ import { readFile } from 'node:fs/promises'; import { Config, AgentConfig } from '../types.js'; import { createAdapter } from '../agents/adapter.js'; +import { MCP_ROOT_PLACEHOLDER } from '../sandbox/mcp.js'; export async function loadConfig(configPath: string): Promise { let raw: string; @@ -65,6 +66,61 @@ function validateExecutorPluginEntry(plugin: Record, prefix: st } } +function validateExecutorMcpServerEntry(server: Record, prefix: string): void { + if (!server || typeof server !== 'object' || Array.isArray(server)) { + throw new Error(`${prefix} must be an object`); + } + + if (!server.name || typeof server.name !== 'string') { + throw new Error(`${prefix} requires a non-empty 'name' string`); + } + if (!/^[a-zA-Z0-9._-]+$/.test(server.name)) { + throw new Error( + `${prefix}.name '${server.name}' contains unsupported characters. ` + + `Use only letters, digits, '.', '_', or '-'.`, + ); + } + + if (!server.command || typeof server.command !== 'string') { + throw new Error(`${prefix} requires a non-empty 'command' string`); + } + + if (!Array.isArray(server.args) || !server.args.every((a) => typeof a === 'string')) { + throw new Error(`${prefix} requires 'args' to be an array of strings`); + } + const args = server.args as string[]; + + // `type` is OPTIONAL for MCP servers. When present it must be local|git. + if (server.type !== undefined) { + if (typeof server.type !== 'string' || !VALID_PLUGIN_TYPES.includes(server.type)) { + throw new Error( + `${prefix}.type '${String(server.type)}' is invalid. Must be one of: ` + + `${VALID_PLUGIN_TYPES.map(t => `'${t}'`).join(', ')} (or omitted for a sourceless server)`, + ); + } + switch (server.type) { + case 'local': + if (!server.path || typeof server.path !== 'string') { + throw new Error(`${prefix} type 'local' requires path to be set`); + } + break; + case 'git': + if (!server.url || typeof server.url !== 'string') { + throw new Error(`${prefix} type 'git' requires url to be set`); + } + break; + } + } else { + // Sourceless server — `${MCP_ROOT}` cannot be substituted, so reject it. + if (args.some((a) => a.includes(MCP_ROOT_PLACEHOLDER))) { + throw new Error( + `${prefix} uses the '${MCP_ROOT_PLACEHOLDER}' placeholder but has no source ` + + `('type' is omitted). Add a 'local' or 'git' source, or remove the placeholder.`, + ); + } + } +} + function validateSourceEntry(source: Record, prefix: string): void { if (!source || typeof source !== 'object' || Array.isArray(source)) { throw new Error(`${prefix} must be an object`); @@ -147,6 +203,23 @@ export function validateConfig(data: unknown, configPath?: string): Config { } } + // Validate executorMcpServers (optional) + if (obj.executorMcpServers !== undefined) { + if (!Array.isArray(obj.executorMcpServers)) { + throw new Error('Config executorMcpServers must be an array'); + } + const seenMcpNames = new Set(); + for (let i = 0; i < obj.executorMcpServers.length; i++) { + const entry = obj.executorMcpServers[i] as Record; + validateExecutorMcpServerEntry(entry, `executorMcpServers[${i}]`); + const name = entry.name as string; + if (seenMcpNames.has(name)) { + throw new Error(`executorMcpServers[${i}].name '${name}' is duplicated`); + } + seenMcpNames.add(name); + } + } + // Validate targets if (!Array.isArray(obj.targets) || obj.targets.length === 0) { throw new Error('Config requires at least one target in targets array'); diff --git a/src/sandbox/__tests__/mcp.test.ts b/src/sandbox/__tests__/mcp.test.ts new file mode 100644 index 0000000..ce68afd --- /dev/null +++ b/src/sandbox/__tests__/mcp.test.ts @@ -0,0 +1,27 @@ +import { describe, it, expect } from 'vitest'; +import { MCP_ROOT_PLACEHOLDER, substituteMcpRoot } from '../mcp.js'; + +describe('substituteMcpRoot', () => { + it('replaces every occurrence of the placeholder in each arg', () => { + expect(substituteMcpRoot( + ['${MCP_ROOT}/index.js', '--config', '${MCP_ROOT}/config.json'], + '/sandbox/.mcp-servers/mine', + )).toEqual([ + '/sandbox/.mcp-servers/mine/index.js', + '--config', + '/sandbox/.mcp-servers/mine/config.json', + ]); + }); + + it('leaves args without the placeholder untouched', () => { + expect(substituteMcpRoot(['-y', 'server-filesystem'], '/anywhere')).toEqual(['-y', 'server-filesystem']); + }); + + it('replaces multiple placeholders within a single arg', () => { + expect(substituteMcpRoot(['${MCP_ROOT}/a:${MCP_ROOT}/b'], '/x')).toEqual(['/x/a:/x/b']); + }); + + it('exports the placeholder literal that validators and adapters can share', () => { + expect(MCP_ROOT_PLACEHOLDER).toBe('${MCP_ROOT}'); + }); +}); diff --git a/src/sandbox/mcp.ts b/src/sandbox/mcp.ts new file mode 100644 index 0000000..b30d74b --- /dev/null +++ b/src/sandbox/mcp.ts @@ -0,0 +1,108 @@ +import { readFile, stat as fsStat } from 'node:fs/promises'; +import { execFile } from 'node:child_process'; +import { basename, join } from 'node:path'; +import { tmpdir } from 'node:os'; +import type { ResolvedExecutorMcpServer } from '../types.js'; +import type { MicrosandboxClient } from './microsandbox.js'; + +export const MCP_ROOT_PLACEHOLDER = '${MCP_ROOT}'; + +export function substituteMcpRoot(args: string[], destDir: string): string[] { + return args.map((a) => a.split(MCP_ROOT_PLACEHOLDER).join(destDir)); +} + +/** + * Process-scoped cache: MCP server source path → verbatim-tarball path on disk. + * Kept separate from the SDK-source archive cache in `scaffolding.ts` because + * MCP server payloads are runnable artifacts (must include `node_modules` and + * any build output), while SDK-source archives strip those for code review. + */ +const mcpServerArchiveCache = new Map(); + +/** Create a verbatim tar.gz archive of an MCP server source directory. */ +async function tarMcpServerSource(srcPath: string): Promise { + const cached = mcpServerArchiveCache.get(srcPath); + if (cached) { + try { + await fsStat(cached); + return cached; + } catch { + mcpServerArchiveCache.delete(srcPath); + } + } + + const dirName = basename(srcPath); + const tarPath = join(tmpdir(), `agentic-mcp-server-${dirName}-${Date.now()}.tar.gz`); + + await new Promise((resolve, reject) => { + execFile('tar', ['czf', tarPath, '-C', srcPath, '.'], { + timeout: 300_000, + maxBuffer: 50 * 1024 * 1024, + }, (error) => { + if (error) reject(new Error(`Failed to archive MCP server source at ${srcPath}: ${error.message}`)); + else resolve(); + }); + }); + + mcpServerArchiveCache.set(srcPath, tarPath); + return tarPath; +} + +/** + * Tar an MCP server source directory on the host, upload it to the sandbox, + * and extract it into `sandboxDestDir`. Separate from `uploadDirToSandbox` + * because MCP servers are runnable artifacts: the archive must include + * `node_modules` (and any other "bloat" the SDK-source path strips) or the + * server crashes on spawn with ERR_MODULE_NOT_FOUND. + */ +export async function uploadMcpServerPayload( + client: MicrosandboxClient, + hostDir: string, + sandboxDestDir: string, + serverName: string, +): Promise { + const tarPath = await tarMcpServerSource(hostDir); + const tarData = await readFile(tarPath); + const sandboxTarPath = `/tmp/_mcp_${serverName}.tar.gz`; + await client.uploadBinaryFile(sandboxTarPath, tarData); + const result = await client.runCommand( + `mkdir -p '${sandboxDestDir}' && tar xzf '${sandboxTarPath}' -C '${sandboxDestDir}' && rm -f '${sandboxTarPath}'`, + ); + if (result.exitCode !== 0) { + throw new Error( + `Failed to extract MCP server '${serverName}' into sandbox at '${sandboxDestDir}': ` + + `${result.stderr || result.stdout}`, + ); + } +} + +export interface ResolvedMcpServerArgs { + name: string; + command: string; + args: string[]; +} + +/** + * Upload sourced MCP server payloads in parallel and return per-server resolved + * args (with `${MCP_ROOT}` substituted for the sandbox install dir). Sourceless + * entries pass through with their original args. Each adapter then writes its + * CLI-specific MCP config from the result. + */ +export async function uploadMcpServerSources( + client: MicrosandboxClient, + mcpRoot: string, + servers: ResolvedExecutorMcpServer[], +): Promise { + return Promise.all(servers.map(async (server) => { + if (server.kind === 'sourceless') { + return { name: server.name, command: server.command, args: server.args }; + } + const destDir = `${mcpRoot}/${server.name}`; + await uploadMcpServerPayload(client, server.hostDir, destDir, server.name); + return { + name: server.name, + command: server.command, + args: substituteMcpRoot(server.args, destDir), + }; + })); +} diff --git a/src/sandbox/scaffolding.ts b/src/sandbox/scaffolding.ts index eabb9dc..4e955e5 100644 --- a/src/sandbox/scaffolding.ts +++ b/src/sandbox/scaffolding.ts @@ -3,7 +3,10 @@ import { execFile } from 'node:child_process'; import { basename, join, relative, resolve as resolvePath } from 'node:path'; import { tmpdir } from 'node:os'; import type { MicrosandboxClient } from './microsandbox.js'; -import type { Config, TestCase, SourceConfig, ExecutorPlugin, ResolvedExecutorPlugin } from '../types.js'; +import type { + Config, TestCase, SourceConfig, ExecutorPlugin, ResolvedExecutorPlugin, + ExecutorMcpServer, ResolvedExecutorMcpServer, +} from '../types.js'; import { resolveSources, resolveSource } from '../core/source-resolver.js'; /** Directories excluded from source uploads — these are large and not useful for evaluation. */ @@ -79,7 +82,12 @@ async function readDirRecursive( } /** - * Create a tar.gz archive of a directory, excluding common bloat dirs. + * Create a tar.gz archive of an SDK *source* directory. Common bloat dirs + * (`node_modules`, `.git`, build outputs, …) and large binaries are stripped + * because the target is code review, not runtime use. For runnable artifacts + * (e.g. MCP server payloads) see `uploadMcpServerPayload` in `./mcp.ts`, which + * archives the tree verbatim instead. + * * Archives are cached on disk so concurrent sandboxes reuse the same tarball. */ export async function getSourceArchive(srcPath: string): Promise { @@ -116,9 +124,10 @@ export async function getSourceArchive(srcPath: string): Promise { } /** - * Tar a host directory, upload the archive to the sandbox, and extract it - * into `sandboxDestDir`. Used for any "copy this directory tree to a known - * path inside the VM" operation (source uploads, plugin installs). + * Tar an SDK *source* directory on the host, upload the archive, and extract + * it into `sandboxDestDir`. Used for source uploads and plugin installs — + * anything whose target is code review rather than runtime execution. For + * runnable artifacts see `uploadMcpServerPayload` in `./mcp.ts`. * * `archiveLabel` should be a slug-safe identifier (used only to name the * temporary tarball path inside the sandbox). @@ -301,3 +310,41 @@ export async function resolveExecutorPlugins( return { name: plugin.name, hostDir: resolvePath(hostDir) }; })); } + +/** + * Resolve `config.executorMcpServers` to host-side directories. Mirrors + * `resolveExecutorPlugins`: entries with a source (local/git) resolve via the + * shared source-resolver (git clones cached under `cacheRepos`); sourceless + * entries (no `type`) carry no `hostDir`. + * + * Returns an empty array when no MCP servers are configured. + */ +export async function resolveExecutorMcpServers( + servers: ExecutorMcpServer[] | undefined, + cacheRepos: string, +): Promise { + if (!servers || servers.length === 0) return []; + + return Promise.all(servers.map(async (server): Promise => { + if (server.type === undefined) { + return { kind: 'sourceless', name: server.name, command: server.command, args: server.args }; + } + const source: SourceConfig = server.type === 'local' + ? { type: 'local', path: server.path } + : { + type: 'git', + url: server.url, + branch: server.branch, + subpath: server.subpath, + sparse: server.sparse, + }; + const hostDir = await resolveSource(source, { reposDir: cacheRepos }); + return { + kind: 'sourced', + name: server.name, + command: server.command, + args: server.args, + hostDir: resolvePath(hostDir), + }; + })); +} diff --git a/src/scoring/__tests__/judge.test.ts b/src/scoring/__tests__/judge.test.ts index 0a3691f..be5eb66 100644 --- a/src/scoring/__tests__/judge.test.ts +++ b/src/scoring/__tests__/judge.test.ts @@ -72,6 +72,7 @@ function makeMockAdapter(opts: { stdout: string }) { extractResult: vi.fn().mockImplementation((stdout: string) => stdout), extractLog: vi.fn().mockResolvedValue(null), installPluginsInSandbox: vi.fn().mockResolvedValue(undefined), + installMcpServersInSandbox: vi.fn().mockResolvedValue(undefined), }; } diff --git a/src/types.ts b/src/types.ts index c6739ff..4543b65 100644 --- a/src/types.ts +++ b/src/types.ts @@ -183,6 +183,80 @@ export interface ResolvedExecutorPlugin { hostDir: string; } +/** + * Source of an MCP server to install into the executor's agent CLI. + * + * Independent of `ExecutorPlugin` — MCP servers are wired through each CLI's + * native MCP-config surface, NOT through agent plugins (a Claude plugin's + * `mcpServers` are not loaded by `claude --plugin-dir`). + * + * Three shapes, discriminated by the optional `type` field: + * - `LocalExecutorMcpServer` (`type:'local'`) — upload a host directory tree. + * - `GitExecutorMcpServer` (`type:'git'`) — clone a git source. + * - `CommandExecutorMcpServer` (no `type`) — sourceless; `command`/`args` + * are used as-is (e.g. an MCP server launched via `npx`). + * + * All three share `name`/`command`/`args`. `args` may contain the literal + * placeholder `${MCP_ROOT}`, substituted at install time with the absolute + * sandbox path of the uploaded source directory — only valid when there IS a + * source. + */ +export interface BaseExecutorMcpServer { + /** MCP server slug — used as the server name in MCP config and the install dir name. */ + name: string; + /** Executable that launches the MCP server (e.g. "node", "npx"). */ + command: string; + /** Args passed to `command`. May contain `${MCP_ROOT}` when a source is present. */ + args: string[]; +} + +export interface LocalExecutorMcpServer extends BaseExecutorMcpServer { + type: 'local'; + /** Absolute or relative path to a host directory tree uploaded into the sandbox. */ + path: string; +} + +export interface GitExecutorMcpServer extends BaseExecutorMcpServer { + type: 'git'; + url: string; + branch?: string; + /** Path within the cloned repo that contains the MCP server source. */ + subpath?: string; + sparse?: string[]; +} + +export interface CommandExecutorMcpServer extends BaseExecutorMcpServer { + type?: undefined; +} + +export type ExecutorMcpServer = + | LocalExecutorMcpServer + | GitExecutorMcpServer + | CommandExecutorMcpServer; + +/** + * An ExecutorMcpServer after host-side resolution. Discriminated on `kind` so + * consumers narrow at compile time instead of probing an optional `hostDir`. + * `sourced` entries were `local`/`git` in the input and carry a host directory + * to upload; `sourceless` entries had no input `type` and run from `command` + * alone (e.g. `npx server-name`). + */ +export type ResolvedExecutorMcpServer = + | { + kind: 'sourced'; + name: string; + command: string; + args: string[]; + /** Absolute host path to the MCP server source. */ + hostDir: string; + } + | { + kind: 'sourceless'; + name: string; + command: string; + args: string[]; + }; + export interface SecretConfig { /** Raw value or "$ENV_VAR" reference resolved from host environment. */ value: string; @@ -223,6 +297,15 @@ export interface Config { * with these so judge scoring stays independent of the executor's tooling. */ executorPlugins?: ExecutorPlugin[]; + /** + * MCP servers to install into the executor agent's CLI inside the sandbox VM. + * Parallel to `executorPlugins` but a separate mechanism — each adapter wires + * these through its CLI's native MCP-config surface (Claude: `--mcp-config`, + * Codex: `[mcp_servers.*]` in `config.toml`). MCP servers are scoped to the + * executor — the judge sandbox is intentionally not seeded with them so judge + * scoring stays independent of the executor's tooling. + */ + executorMcpServers?: ExecutorMcpServer[]; sandbox: SandboxConfig; }