diff --git a/README.md b/README.md index 982bb7e..dbe7149 100644 --- a/README.md +++ b/README.md @@ -441,6 +441,45 @@ Template files and setup scripts for the test workspace: | `template` | Local directory uploaded to `/workspace/` in the sandbox | | `setupScript` | Script file uploaded and executed during scaffolding | +### Executor plugins + +Install Claude Code / Codex / Gemini plugins into the executor sandbox before the agent runs. Useful for A/B-testing whether shipping a plugin (skills, slash commands, marketplace bundles) measurably improves an agent's ability to use your SDK. + +Plugins are installed **only** in the executor sandbox — the judge sandbox stays plugin-free so its scoring is independent of the executor's tooling. Run the same suite twice (once without `executorPlugins`, once with) and compare per-test-case judge scores in the inspect UI. + +```json +{ + "executorPlugins": [ + { "type": "local", "name": "my-sdk-skills", "path": "/abs/path/to/plugin-dir" }, + { + "type": "git", + "name": "shared-skills", + "url": "https://github.com/example/skills.git", + "branch": "main", + "subpath": "plugins/shared-skills" + } + ] +} +``` + +| Field | Description | +|---|---| +| `type` | `"local"` or `"git"` | +| `name` | Plugin slug (letters, digits, `.`, `_`, `-`). Must match the plugin manifest's name and must be unique across `executorPlugins`. | +| `path` | For `type: "local"`. Directory on the host containing the adapter-specific manifest. | +| `url` / `branch` / `subpath` / `sparse` | For `type: "git"`. Same semantics as `GitSource` under `privateInfo`. | + +What each adapter expects inside the plugin directory: + +| Adapter | Required file(s) | Where it lands in the sandbox | +|---|---|---| +| `claude` | `.claude-plugin/plugin.json` | Plugin dir extracted to `$HOME/.claude/plugins/<name>/`, then loaded via the documented `--plugin-dir <path>` CLI flag at each invocation. 
(Marketplace registration is intentionally skipped — Claude Code's marketplace flow prompts for trust, which can't be answered in `--print` mode.) | +| `codex` | `.codex-plugin/plugin.json` plus one or more `skills/<skill>/SKILL.md` | Each `skills/<skill>/` extracted to `$CODEX_HOME/skills/<skill>/`. Codex auto-discovers skills from that directory. | +| `gemini` | `gemini-extension.json` at the plugin root | The whole plugin dir extracted to `$HOME/.gemini/extensions/<name>/`. | +| custom | — | Not supported. The adapter raises a clear error at install time. | + +Adapters fail fast at install time if the required manifest is missing, so an A/B run cannot silently no-op against the wrong CLI. + ### Sandbox Resource limits, secrets, and environment variables for sandbox VMs: diff --git a/skills/_reference/config-schema.md b/skills/_reference/config-schema.md index 276784f..38735a0 100644 --- a/skills/_reference/config-schema.md +++ b/skills/_reference/config-schema.md @@ -9,6 +9,7 @@ | `agents` | `object` | No | Per-role agent configuration. | | `targets` | `TargetConfig[]` | **Yes** | Non-empty array. Docker images for sandboxed execution. | | `workspace` | `WorkspaceConfig` | No | Workspace template and setup. | +| `executorPlugins` | `ExecutorPlugin[]` | No | Plugin directories installed into the executor's agent CLI inside the sandbox (Claude plugins loaded via `--plugin-dir`, Codex skills, Gemini extensions). Not installed in the judge sandbox — that's intentional, so the judge stays independent of the executor's tooling. | | `sandbox` | `SandboxConfig` | **Yes** | Must be an object (can be `{}`). Resource limits, secrets, env vars. 
| ## SourceConfig (discriminated union on `type`) @@ -142,6 +143,49 @@ Custom agents (any command not in the table above) **must** provide `envVar` and | `template` | `string` | No — local directory to copy into sandbox workspace | | `setupScript` | `string` | No — path to script run during workspace setup | +## ExecutorPlugin (discriminated union on `type`) + +A plugin tree installed into the executor's agent CLI. Use these to A/B test +whether shipping skills/plugins to the executor improves judge scores. Plugins +are installed **only** in the executor sandbox; the judge sandbox is kept +plugin-free so its scoring is independent of the executor's tooling. + +Each entry has a `name` (slug — letters/digits/`.`/`_`/`-` only) plus the +discriminator: + +### LocalExecutorPlugin (`type: "local"`) + +| Field | Type | Required | +|-------|------|----------| +| `type` | `"local"` | Yes | +| `name` | `string` | Yes — plugin slug | +| `path` | `string` | Yes — absolute or relative directory on the host | + +### GitExecutorPlugin (`type: "git"`) + +| Field | Type | Required | +|-------|------|----------| +| `type` | `"git"` | Yes | +| `name` | `string` | Yes — plugin slug | +| `url` | `string` | Yes — git repository URL | +| `branch` | `string` | No | +| `subpath` | `string` | No — path within the repo to the plugin dir | +| `sparse` | `string[]` | No — sparse checkout paths | + +### Per-adapter expectations + +What an adapter requires inside the plugin directory: + +| Adapter | Required file(s) | Sandbox destination | +|---|---|---| +| `claude` | `.claude-plugin/plugin.json` at plugin root | Plugin dir extracted to `$HOME/.claude/plugins/<name>/`; loaded for each session via the `--plugin-dir <path>` CLI flag. | +| `codex` | `.codex-plugin/plugin.json` at plugin root, and one or more `skills/<skill>/SKILL.md` files | Each `skills/<skill>/` dir extracted to `$CODEX_HOME/skills/<skill>/` (auto-discovered). 
| +| `gemini` | `gemini-extension.json` at plugin root | Entire plugin dir extracted to `$HOME/.gemini/extensions//`. | +| custom | — | Not supported. Adapter throws a clear error if `executorPlugins` is non-empty. | + +Each adapter fails fast at install time if its required file is missing — the +A/B comparison won't silently no-op. + ## Validation Rules 1. Root must be a JSON object @@ -153,6 +197,7 @@ Custom agents (any command not in the table above) **must** provide `envVar` and 7. `agents.executor` and `agents.judge` must have `secret.value` (non-empty string) 8. Custom agents must provide `envVar` and `baseUrl` in their secret 9. `baseUrl` must be a parseable URL +10. `executorPlugins`, if present, must be an array; each entry needs a `name` (slug-safe) and a valid `type` (`local` or `git`); names must be unique ## Minimal Examples diff --git a/src/agents/__tests__/claude.test.ts b/src/agents/__tests__/claude.test.ts index 7e18993..3d51a8a 100644 --- a/src/agents/__tests__/claude.test.ts +++ b/src/agents/__tests__/claude.test.ts @@ -1,15 +1,28 @@ import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { access } from 'node:fs/promises'; import { spawnAgent, spawnInteractive } from '../spawn.js'; +import { uploadDirToSandbox } from '../../sandbox/scaffolding.js'; import { ClaudeAdapter } from '../claude.js'; import { makeAgentResult } from '../../__tests__/helpers/fixtures.js'; +import { makeMockSandboxClient } from '../../__tests__/helpers/mock-sandbox-client.js'; vi.mock('../spawn.js', () => ({ spawnAgent: vi.fn(), spawnInteractive: vi.fn(), })); +vi.mock('node:fs/promises', () => ({ + access: vi.fn(), +})); + +vi.mock('../../sandbox/scaffolding.js', () => ({ + uploadDirToSandbox: vi.fn(), +})); + const mockSpawnAgent = vi.mocked(spawnAgent); const mockSpawnInteractive = vi.mocked(spawnInteractive); +const mockAccess = vi.mocked(access); +const mockUploadDir = vi.mocked(uploadDirToSandbox); describe('ClaudeAdapter', () => { let adapter: 
ClaudeAdapter; @@ -131,4 +144,47 @@ describe('ClaudeAdapter', () => { expect(adapter.installCommand).toBe('npm i -g @anthropic-ai/claude-code'); }); }); + + describe('installPluginsInSandbox', () => { + it('is a no-op when given an empty plugin list', async () => { + const client = makeMockSandboxClient(); + await adapter.installPluginsInSandbox(client as any, []); + expect(client.runCommand).not.toHaveBeenCalled(); + expect(client.uploadFiles).not.toHaveBeenCalled(); + }); + + it('throws clearly when a plugin is missing its Claude manifest', async () => { + mockAccess.mockRejectedValueOnce(new Error('ENOENT')); + const client = makeMockSandboxClient(); + await expect(adapter.installPluginsInSandbox(client as any, [ + { name: 'broken', hostDir: '/tmp/broken' }, + ])).rejects.toThrow(/\.claude-plugin\/plugin\.json/); + expect(client.runCommand).not.toHaveBeenCalled(); + }); + + it('extracts each plugin into /root/.claude/plugins/ and records the paths', async () => { + mockAccess.mockResolvedValue(undefined); + const client = makeMockSandboxClient(); + + await adapter.installPluginsInSandbox(client as any, [ + { name: 'plugin-a', hostDir: '/tmp/a' }, + { name: 'plugin-b', hostDir: '/tmp/b' }, + ]); + + expect(mockUploadDir).toHaveBeenCalledTimes(2); + expect(mockUploadDir).toHaveBeenCalledWith(client, '/tmp/a', '/root/.claude/plugins/plugin-a', 'plugin_plugin-a'); + expect(mockUploadDir).toHaveBeenCalledWith(client, '/tmp/b', '/root/.claude/plugins/plugin-b', 'plugin_plugin-b'); + + // sandboxCommand should now emit --plugin-dir for each plugin. 
+ const cmd = adapter.sandboxCommand('do the thing'); + expect(cmd).toContain("--plugin-dir '/root/.claude/plugins/plugin-a'"); + expect(cmd).toContain("--plugin-dir '/root/.claude/plugins/plugin-b'"); + }); + + it('sandboxCommand does not include --plugin-dir flags when no plugins have been installed', () => { + const fresh = new ClaudeAdapter({ command: 'claude' }); + const cmd = fresh.sandboxCommand('do the thing'); + expect(cmd).not.toContain('--plugin-dir'); + }); + }); }); diff --git a/src/agents/__tests__/codex.test.ts b/src/agents/__tests__/codex.test.ts index 49b62d3..b42b4f6 100644 --- a/src/agents/__tests__/codex.test.ts +++ b/src/agents/__tests__/codex.test.ts @@ -1,8 +1,10 @@ import { describe, it, expect, vi, beforeEach } from 'vitest'; -import { writeFile, readFile, rm } from 'node:fs/promises'; +import { writeFile, readFile, rm, access, readdir, stat } from 'node:fs/promises'; import { spawnAgent, spawnInteractive } from '../spawn.js'; +import { uploadDirToSandbox } from '../../sandbox/scaffolding.js'; import { CodexAdapter } from '../codex.js'; import { makeAgentResult } from '../../__tests__/helpers/fixtures.js'; +import { makeMockSandboxClient } from '../../__tests__/helpers/mock-sandbox-client.js'; vi.mock('../spawn.js', () => ({ spawnAgent: vi.fn(), @@ -13,6 +15,13 @@ vi.mock('node:fs/promises', () => ({ writeFile: vi.fn().mockResolvedValue(undefined), readFile: vi.fn(), rm: vi.fn().mockResolvedValue(undefined), + access: vi.fn(), + readdir: vi.fn(), + stat: vi.fn(), +})); + +vi.mock('../../sandbox/scaffolding.js', () => ({ + uploadDirToSandbox: vi.fn(), })); const mockSpawnAgent = vi.mocked(spawnAgent); @@ -20,6 +29,10 @@ const mockSpawnInteractive = vi.mocked(spawnInteractive); const mockWriteFile = vi.mocked(writeFile); const mockReadFile = vi.mocked(readFile); const mockRm = vi.mocked(rm); +const mockAccess = vi.mocked(access); +const mockReaddir = vi.mocked(readdir); +const mockStat = vi.mocked(stat); +const mockUploadDir = 
vi.mocked(uploadDirToSandbox); describe('CodexAdapter', () => { let adapter: CodexAdapter; @@ -120,4 +133,101 @@ describe('CodexAdapter', () => { expect(adapter.installCommand).toBe('npm i -g @openai/codex@0.93.0'); }); }); + + describe('installPluginsInSandbox', () => { + function makeDirent(name: string, isDir: boolean) { + return { + name, + isDirectory: () => isDir, + isFile: () => !isDir, + } as any; + } + + it('is a no-op when given an empty plugin list', async () => { + const client = makeMockSandboxClient(); + await adapter.installPluginsInSandbox(client as any, []); + expect(client.runCommand).not.toHaveBeenCalled(); + }); + + it('throws when a plugin is missing its Codex manifest', async () => { + mockAccess.mockRejectedValueOnce(new Error('ENOENT')); + const client = makeMockSandboxClient(); + await expect(adapter.installPluginsInSandbox(client as any, [ + { name: 'broken', hostDir: '/tmp/broken' }, + ])).rejects.toThrow(/\.codex-plugin\/plugin\.json/); + expect(mockUploadDir).not.toHaveBeenCalled(); + }); + + it('throws when a plugin has no skills/ directory', async () => { + mockAccess.mockResolvedValueOnce(undefined); + mockReaddir.mockRejectedValueOnce(new Error('ENOENT')); + const client = makeMockSandboxClient(); + await expect(adapter.installPluginsInSandbox(client as any, [ + { name: 'empty', hostDir: '/tmp/empty' }, + ])).rejects.toThrow(/no 'skills\/' directory/); + }); + + it('throws when a plugin contributes no SKILL.md-bearing dirs', async () => { + mockAccess.mockResolvedValueOnce(undefined); + mockReaddir.mockResolvedValueOnce([ + makeDirent('not-a-skill', true), + ]); + mockStat.mockRejectedValueOnce(new Error('ENOENT')); + const client = makeMockSandboxClient(); + await expect(adapter.installPluginsInSandbox(client as any, [ + { name: 'shell', hostDir: '/tmp/shell' }, + ])).rejects.toThrow(/no usable Codex skills/); + }); + + it('extracts each plugin skill into $CODEX_HOME/skills/', async () => { + 
mockAccess.mockResolvedValue(undefined); + // One plugin with two skills. + mockReaddir.mockResolvedValueOnce([ + makeDirent('skill-one', true), + makeDirent('skill-two', true), + makeDirent('not-a-dir', false), + ]); + mockStat.mockResolvedValue({ isFile: () => true } as any); + + const client = makeMockSandboxClient(); + client.runCommand + .mockResolvedValueOnce({ stdout: '/root/.codex', stderr: '', exitCode: 0 }) // printf CODEX_HOME + .mockResolvedValue({ stdout: '', stderr: '', exitCode: 0 }); // mkdir, etc. + + await adapter.installPluginsInSandbox(client as any, [ + { name: 'bundle', hostDir: '/tmp/bundle' }, + ]); + + expect(mockUploadDir).toHaveBeenCalledTimes(2); + expect(mockUploadDir).toHaveBeenCalledWith( + client, + expect.stringContaining('skills/skill-one'), + '/root/.codex/skills/skill-one', + 'codex_skill_skill-one', + ); + expect(mockUploadDir).toHaveBeenCalledWith( + client, + expect.stringContaining('skills/skill-two'), + '/root/.codex/skills/skill-two', + 'codex_skill_skill-two', + ); + }); + + it('throws when two plugins contribute the same skill name, naming both', async () => { + mockAccess.mockResolvedValue(undefined); + // Two plugins, each contributing a skill called 'shared'. 
+ mockReaddir + .mockResolvedValueOnce([makeDirent('shared', true)]) + .mockResolvedValueOnce([makeDirent('shared', true)]); + mockStat.mockResolvedValue({ isFile: () => true } as any); + + const client = makeMockSandboxClient(); + client.runCommand.mockResolvedValue({ stdout: '/root/.codex', stderr: '', exitCode: 0 }); + + await expect(adapter.installPluginsInSandbox(client as any, [ + { name: 'plugin-a', hostDir: '/tmp/a' }, + { name: 'plugin-b', hostDir: '/tmp/b' }, + ])).rejects.toThrow(/'plugin-a'.*'plugin-b'/); + }); + }); }); diff --git a/src/agents/__tests__/custom.test.ts b/src/agents/__tests__/custom.test.ts index 51fbba9..214f033 100644 --- a/src/agents/__tests__/custom.test.ts +++ b/src/agents/__tests__/custom.test.ts @@ -182,4 +182,18 @@ describe('CustomAdapter', () => { }); }); + describe('installPluginsInSandbox', () => { + it('is a no-op when given an empty plugin list', async () => { + const adapter = new CustomAdapter({ command: 'my-tool' }); + await expect(adapter.installPluginsInSandbox({} as any, [])).resolves.toBeUndefined(); + }); + + it('throws a clear error when given plugins (custom CLIs have no documented plugin layout)', async () => { + const adapter = new CustomAdapter({ command: 'my-tool' }); + await expect(adapter.installPluginsInSandbox({} as any, [ + { name: 'x', hostDir: '/tmp/x' }, + ])).rejects.toThrow(/does not support executorPlugins/); + }); + }); + }); diff --git a/src/agents/__tests__/gemini.test.ts b/src/agents/__tests__/gemini.test.ts index fa02aee..06393a5 100644 --- a/src/agents/__tests__/gemini.test.ts +++ b/src/agents/__tests__/gemini.test.ts @@ -1,15 +1,28 @@ import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { access } from 'node:fs/promises'; import { spawnAgent, spawnInteractive } from '../spawn.js'; +import { uploadDirToSandbox } from '../../sandbox/scaffolding.js'; import { GeminiAdapter } from '../gemini.js'; import { makeAgentResult } from '../../__tests__/helpers/fixtures.js'; +import { 
makeMockSandboxClient } from '../../__tests__/helpers/mock-sandbox-client.js'; vi.mock('../spawn.js', () => ({ spawnAgent: vi.fn(), spawnInteractive: vi.fn(), })); +vi.mock('node:fs/promises', () => ({ + access: vi.fn(), +})); + +vi.mock('../../sandbox/scaffolding.js', () => ({ + uploadDirToSandbox: vi.fn(), +})); + const mockSpawnAgent = vi.mocked(spawnAgent); const mockSpawnInteractive = vi.mocked(spawnInteractive); +const mockAccess = vi.mocked(access); +const mockUploadDir = vi.mocked(uploadDirToSandbox); describe('GeminiAdapter', () => { let adapter: GeminiAdapter; @@ -104,4 +117,47 @@ describe('GeminiAdapter', () => { expect(adapter.installCommand).toBe('npm i -g @google/gemini-cli'); }); }); + + describe('installPluginsInSandbox', () => { + it('is a no-op when given an empty plugin list', async () => { + const client = makeMockSandboxClient(); + await adapter.installPluginsInSandbox(client as any, []); + expect(client.runCommand).not.toHaveBeenCalled(); + }); + + it('throws when a plugin has no gemini-extension.json manifest', async () => { + mockAccess.mockRejectedValueOnce(new Error('ENOENT')); + const client = makeMockSandboxClient(); + await expect(adapter.installPluginsInSandbox(client as any, [ + { name: 'noext', hostDir: '/tmp/noext' }, + ])).rejects.toThrow(/gemini-extension\.json/); + }); + + it('extracts each manifest-bearing plugin into ~/.gemini/extensions', async () => { + mockAccess.mockResolvedValue(undefined); + const client = makeMockSandboxClient(); + client.runCommand + .mockResolvedValueOnce({ stdout: '/root', stderr: '', exitCode: 0 }) // printf HOME + .mockResolvedValue({ stdout: '', stderr: '', exitCode: 0 }); // mkdir + + await adapter.installPluginsInSandbox(client as any, [ + { name: 'ext-a', hostDir: '/tmp/a' }, + { name: 'ext-b', hostDir: '/tmp/b' }, + ]); + + expect(mockUploadDir).toHaveBeenCalledTimes(2); + expect(mockUploadDir).toHaveBeenCalledWith( + client, + '/tmp/a', + '/root/.gemini/extensions/ext-a', + 'gemini_ext_ext-a', 
+ ); + expect(mockUploadDir).toHaveBeenCalledWith( + client, + '/tmp/b', + '/root/.gemini/extensions/ext-b', + 'gemini_ext_ext-b', + ); + }); + }); }); diff --git a/src/agents/adapter.ts b/src/agents/adapter.ts index ab833d4..c74e88c 100644 --- a/src/agents/adapter.ts +++ b/src/agents/adapter.ts @@ -1,4 +1,4 @@ -import { AgentConfig, AgentResult } from '../types.js'; +import { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { ClaudeAdapter } from './claude.js'; import { CodexAdapter } from './codex.js'; @@ -34,6 +34,15 @@ export interface AgentAdapter { /** Extract agent session log from inside the sandbox. Returns raw log content or null. */ extractLog(client: MicrosandboxClient): Promise; + + /** + * Install plugin directory trees into the running sandbox so the agent CLI + * picks them up at startup. Each adapter knows the CLI-specific layout + * (e.g. `~/.claude/plugins/<name>/` plus `--plugin-dir` flags for Claude). + * Adapters whose CLI cannot load plugins in non-interactive mode raise a + * clear error here instead of silently succeeding. + */ + installPluginsInSandbox(client: MicrosandboxClient, plugins: ResolvedExecutorPlugin[]): Promise; } const KNOWN_ADAPTERS: Record AgentAdapter> = { diff --git a/src/agents/base.ts b/src/agents/base.ts index 4a7b9a3..aa141a2 100644 --- a/src/agents/base.ts +++ b/src/agents/base.ts @@ -1,4 +1,4 @@ -import type { AgentConfig, AgentResult } from '../types.js'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; import type { AgentAdapter } from './adapter.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { spawnAgent, spawnInteractive } from './spawn.js'; @@ -85,6 +85,23 @@ export abstract class BaseAdapter implements AgentAdapter { return null; } + /** + * Install plugin directories into the sandbox VM. 
Subclasses override with + * CLI-specific layout (Claude: `~/.claude/plugins/`, Codex: `$CODEX_HOME/skills/`, Gemini: `~/.gemini/extensions/`). + * Default raises a clear error so adapters that don't (yet) support plugins + * fail loudly when the user wires `executorPlugins` against them. + */ + async installPluginsInSandbox( + _client: MicrosandboxClient, + plugins: ResolvedExecutorPlugin[], + ): Promise { + if (plugins.length === 0) return; + throw new Error( + `Agent adapter '${this.name}' does not support executorPlugins. ` + + `Either remove executorPlugins from config or switch executor to an adapter that supports plugin loading.`, + ); + } + /** Shared helper: spawn the agent process with piped stdio. */ protected spawn(args: string[], workDir: string, env?: Record, timeout?: number, stdin?: string): Promise { return spawnAgent(this.config.command, args, { cwd: workDir, env, timeout, stdin }); diff --git a/src/agents/claude.ts b/src/agents/claude.ts index 71c00b5..47d77fc 100644 --- a/src/agents/claude.ts +++ b/src/agents/claude.ts @@ -1,5 +1,8 @@ -import type { AgentConfig, AgentResult } from '../types.js'; +import { access } from 'node:fs/promises'; +import { join } from 'node:path'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; +import { uploadDirToSandbox } from '../sandbox/scaffolding.js'; import { BaseAdapter } from './base.js'; export class ClaudeAdapter extends BaseAdapter { @@ -9,6 +12,13 @@ export class ClaudeAdapter extends BaseAdapter { readonly defaultEnvVar = 'ANTHROPIC_API_KEY'; readonly defaultBaseUrl = 'https://api.anthropic.com'; + /** + * Sandbox paths of plugins extracted by `installPluginsInSandbox()`. + * `sandboxCommand()` reads this to emit `--plugin-dir <path>` flags so the + * Claude CLI loads each plugin for the run. Populated only after install. 
+ */ + private installedPluginDirs: string[] = []; + constructor(config: AgentConfig) { super(config); } @@ -19,7 +29,14 @@ export class ClaudeAdapter extends BaseAdapter { const schemaFlags = schema ? ` --output-format json --json-schema '${this.escapeForShell(JSON.stringify(schema))}'` : ''; - const cmd = `cd ${workDir} && IS_SANDBOX=1 claude --print --dangerously-skip-permissions ${args.join(' ')} '${escaped}'${schemaFlags}`.trimEnd(); + // --plugin-dir is the documented Claude CLI flag for loading a local + // plugin directory for the session. It's the simplest mechanism for our + // executorPlugins use case — no marketplace registration, no trust + // prompt, works cleanly in --print mode. + const pluginFlags = this.installedPluginDirs + .map((dir) => ` --plugin-dir '${dir}'`) + .join(''); + const cmd = `cd ${workDir} && IS_SANDBOX=1 claude --print --dangerously-skip-permissions${pluginFlags} ${args.join(' ')} '${escaped}'${schemaFlags}`.trimEnd(); return cmd; } @@ -33,16 +50,47 @@ export class ClaudeAdapter extends BaseAdapter { workDir: string, env?: Record, ): Promise { + // Claude's --json-schema currently rejects top-level non-object schemas + // (the API returns "400 tools.N.custom.input_schema.type: Input should be 'object'"). + // Wrap non-object schemas under a single `result` property and unwrap before + // returning, so callers can hand us arrays / primitives transparently. + // (Codex has the same wrap dance in its own adapter.) + const rootSchema = schema as { type?: string }; + const needsWrapper = rootSchema.type !== 'object'; + const effectiveSchema = needsWrapper + ? { + type: 'object', + properties: { result: schema }, + required: ['result'], + additionalProperties: false, + } + : schema; + const args = [ '--print', '--output-format', 'json', '--json-schema', - JSON.stringify(schema), + JSON.stringify(effectiveSchema), ...(this.config.args ?? 
[]), ]; - return this.spawn(args, workDir, env, undefined, prompt); + const result = await this.spawn(args, workDir, env, undefined, prompt); + + if (needsWrapper) { + try { + const envelope = JSON.parse(result.stdout) as Record; + const so = envelope.structured_output as Record | undefined; + if (so && 'result' in so) { + envelope.structured_output = so.result; + result.stdout = JSON.stringify(envelope); + } + } catch { + // Leave stdout untouched — parseEnvelope will handle it / trigger a retry. + } + } + + return result; } async extractLog(client: MicrosandboxClient): Promise { @@ -58,6 +106,43 @@ export class ClaudeAdapter extends BaseAdapter { } } + /** + * Install Claude Code plugins for the executor's CLI session. + * + * Each plugin is extracted under `$HOME/.claude/plugins//` and the + * resulting paths are stashed for `sandboxCommand()` to emit as + * `--plugin-dir ` flags. No marketplace registration or trust + * prompt — those can't be answered in `--print` mode. + */ + async installPluginsInSandbox( + client: MicrosandboxClient, + plugins: ResolvedExecutorPlugin[], + ): Promise { + if (plugins.length === 0) return; + + await Promise.all(plugins.map(async (plugin) => { + const manifestPath = join(plugin.hostDir, '.claude-plugin', 'plugin.json'); + try { + await access(manifestPath); + } catch { + throw new Error( + `Plugin '${plugin.name}' is missing the Claude manifest at ${manifestPath}. 
` + + `Each Claude Code plugin must contain a .claude-plugin/plugin.json file.`, + ); + } + })); + + const homeResult = await client.runCommand('printf %s "${HOME:-/root}"'); + const home = homeResult.stdout.trim() || '/root'; + const pluginsRoot = `${home}/.claude/plugins`; + + this.installedPluginDirs = await Promise.all(plugins.map(async (plugin) => { + const destDir = `${pluginsRoot}/${plugin.name}`; + await uploadDirToSandbox(client, plugin.hostDir, destDir, `plugin_${plugin.name}`); + return destDir; + })); + } + protected parseEnvelope(result: AgentResult): AgentResult | null { try { const envelope = JSON.parse(result.stdout); diff --git a/src/agents/codex.ts b/src/agents/codex.ts index 3ce6670..0997e72 100644 --- a/src/agents/codex.ts +++ b/src/agents/codex.ts @@ -1,8 +1,9 @@ -import { writeFile, readFile, rm } from 'node:fs/promises'; +import { writeFile, readFile, rm, access, readdir, stat } from 'node:fs/promises'; import { join } from 'node:path'; import { tmpdir } from 'node:os'; -import type { AgentConfig, AgentResult } from '../types.js'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; +import { uploadDirToSandbox } from '../sandbox/scaffolding.js'; import { BaseAdapter } from './base.js'; export class CodexAdapter extends BaseAdapter { @@ -95,6 +96,102 @@ export class CodexAdapter extends BaseAdapter { return result; } + /** + * Install plugin skills into the Codex CLI's auto-discovered skills dir. + * + * Codex doesn't have a "plugin" abstraction like Claude's marketplaces; it + * auto-discovers individual skills under `${CODEX_HOME:-$HOME/.codex}/skills//`. + * For each plugin we iterate its `skills/` subtree and lay out each + * SKILL.md-bearing directory at `$CODEX_HOME/skills//`. 
+ * + * A plugin contributes nothing to Codex if it has no `.codex-plugin/plugin.json` + * manifest, or if it has no `skills/` subdirectory with at least one + * SKILL.md. We fail fast in that case so the user knows the A/B comparison + * won't actually exercise the plugin. + */ + async installPluginsInSandbox( + client: MicrosandboxClient, + plugins: ResolvedExecutorPlugin[], + ): Promise { + if (plugins.length === 0) return; + + type SkillInstall = { plugin: string; srcDir: string; skillName: string }; + + const perPluginSkills = await Promise.all(plugins.map(async (plugin): Promise => { + const manifestPath = join(plugin.hostDir, '.codex-plugin', 'plugin.json'); + try { + await access(manifestPath); + } catch { + throw new Error( + `Plugin '${plugin.name}' is missing the Codex manifest at ${manifestPath}. ` + + `Codex executors need each plugin to ship a .codex-plugin/plugin.json file.`, + ); + } + + const skillsDir = join(plugin.hostDir, 'skills'); + let entries; + try { + entries = await readdir(skillsDir, { withFileTypes: true }); + } catch { + throw new Error( + `Plugin '${plugin.name}' has no 'skills/' directory at ${skillsDir}. ` + + `Codex auto-discovers skills from individual subdirectories — bundle each as /skills//SKILL.md.`, + ); + } + + const skillCandidates = await Promise.all(entries + .filter((entry) => entry.isDirectory()) + .map(async (entry): Promise => { + const skillName = String(entry.name); + const skillDir = join(skillsDir, skillName); + try { + const md = await stat(join(skillDir, 'SKILL.md')); + if (!md.isFile()) return null; + } catch { + return null; + } + return { plugin: plugin.name, srcDir: skillDir, skillName }; + }), + ); + const skills = skillCandidates.filter((s): s is SkillInstall => s !== null); + + if (skills.length === 0) { + throw new Error( + `Plugin '${plugin.name}' contains no usable Codex skills (no //SKILL.md). 
` + + `Each plugin must contribute at least one SKILL.md file under its skills/ subdirectory.`, + ); + } + return skills; + })); + + const homeResult = await client.runCommand('printf %s "${CODEX_HOME:-${HOME:-/root}/.codex}"'); + const codexHome = homeResult.stdout.trim() || '/root/.codex'; + const codexSkillsDir = `${codexHome}/skills`; + + const skillOwners = new Map(); + const installs: SkillInstall[] = []; + for (const skill of perPluginSkills.flat()) { + const prior = skillOwners.get(skill.skillName); + if (prior !== undefined) { + throw new Error( + `Skill '${skill.skillName}' is contributed by both '${prior}' and '${skill.plugin}'. ` + + `Codex requires skill names to be unique across all installed plugins.`, + ); + } + skillOwners.set(skill.skillName, skill.plugin); + installs.push(skill); + } + + await Promise.all(installs.map((skill) => + uploadDirToSandbox( + client, + skill.srcDir, + `${codexSkillsDir}/${skill.skillName}`, + `codex_skill_${skill.skillName}`, + ), + )); + } + async extractLog(client: MicrosandboxClient): Promise { const result = await client.runCommand( "find / -path '*/.codex/sessions/*.jsonl' -type f 2>/dev/null | sort | tail -1", diff --git a/src/agents/gemini.ts b/src/agents/gemini.ts index 9de4f88..03d3dcc 100644 --- a/src/agents/gemini.ts +++ b/src/agents/gemini.ts @@ -1,5 +1,8 @@ -import type { AgentConfig, AgentResult } from '../types.js'; +import { access } from 'node:fs/promises'; +import { join } from 'node:path'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; +import { uploadDirToSandbox } from '../sandbox/scaffolding.js'; import { BaseAdapter } from './base.js'; export class GeminiAdapter extends BaseAdapter { @@ -40,6 +43,53 @@ export class GeminiAdapter extends BaseAdapter { return this.spawn(args, workDir, env, undefined, prompt); } + /** + * Install plugin directories into the Gemini CLI's extensions folder. 
+ * + * Gemini's extension model expects each extension dir to contain a + * `gemini-extension.json` manifest at its root (see Gemini CLI docs: + * Extensions reference). Auto-discovery loads extensions from + * `${GEMINI_HOME:-$HOME/.gemini}/extensions//`. + * + * For each plugin we look for `gemini-extension.json` at the plugin root. + * If present, the entire plugin directory is treated as a Gemini extension + * and laid out at the expected location. If absent, we throw — the plugin + * has nothing the Gemini CLI knows how to load, and an A/B comparison that + * silently no-ops would be misleading. + */ + async installPluginsInSandbox( + client: MicrosandboxClient, + plugins: ResolvedExecutorPlugin[], + ): Promise { + if (plugins.length === 0) return; + + await Promise.all(plugins.map(async (plugin) => { + const manifestPath = join(plugin.hostDir, 'gemini-extension.json'); + try { + await access(manifestPath); + } catch { + throw new Error( + `Plugin '${plugin.name}' has no Gemini extension manifest at ${manifestPath}. ` + + `Gemini CLI loads extensions from a 'gemini-extension.json' file at the extension root. 
` + + `Add one to the plugin directory or remove '${plugin.name}' from executorPlugins when running the Gemini executor.`, + ); + } + })); + + const homeResult = await client.runCommand('printf %s "${HOME:-/root}"'); + const home = homeResult.stdout.trim() || '/root'; + const extensionsDir = `${home}/.gemini/extensions`; + + await Promise.all(plugins.map((plugin) => + uploadDirToSandbox( + client, + plugin.hostDir, + `${extensionsDir}/${plugin.name}`, + `gemini_ext_${plugin.name}`, + ), + )); + } + async extractLog(client: MicrosandboxClient): Promise { const result = await client.runCommand( "find / -path '*/.gemini/tmp/*/chats/session-*.jsonl' -type f 2>/dev/null | sort | tail -1", diff --git a/src/commands/execute.ts b/src/commands/execute.ts index 58654ee..a327c57 100644 --- a/src/commands/execute.ts +++ b/src/commands/execute.ts @@ -5,7 +5,7 @@ import { loadConfig } from '../core/config.js'; import { loadTestSuite, saveResult, saveBinaryResult, formatElapsed } from '../core/suite-io.js'; import { MicrosandboxClient, buildSecrets, applyAgentAuth, resolveEnv, type CommandResult } from '../sandbox/microsandbox.js'; import { createEgressLogger } from '../sandbox/egress-logger.js'; -import { scaffoldWorkspace } from '../sandbox/scaffolding.js'; +import { scaffoldWorkspace, resolveExecutorPlugins } from '../sandbox/scaffolding.js'; import { WorkerPool } from '../sandbox/worker-pool.js'; import { createAdapter } from '../agents/adapter.js'; import { getPackageSource, getUrlSources, getFileSources } from '../types.js'; @@ -198,6 +198,19 @@ export async function executeTestCase( } } + // Plugins are intentionally executor-only — the judge sandbox stays + // plugin-free so its scoring is independent of the executor's tooling. 
+ if (config.executorPlugins && config.executorPlugins.length > 0) { + const resolvedPlugins = await resolveExecutorPlugins(config.executorPlugins, paths.cacheRepos); + try { + await adapter.installPluginsInSandbox(client, resolvedPlugins); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + await saveResult(paths, testCase.id, 'plugin-install-error.log', message, target.name); + throw new Error(`Executor plugin install failed: ${message}`); + } + } + // Upload public file sources (docs directories, etc.) for the executor const publicFileSources = getFileSources(config.publicInfo ?? []); const publicSourceDirs = publicFileSources.length > 0 diff --git a/src/core/__tests__/config.test.ts b/src/core/__tests__/config.test.ts index b8bf405..0efe953 100644 --- a/src/core/__tests__/config.test.ts +++ b/src/core/__tests__/config.test.ts @@ -223,7 +223,7 @@ describe('loadConfig', () => { mockReadFile.mockResolvedValue(JSON.stringify(config)); await expect(loadConfig('/fake/config.json')).rejects.toThrow(/valid URL/); }); - + it('accepts secret pointing at $CLAUDE_CODE_OAUTH_TOKEN (auth mode resolved later by value prefix)', async () => { const config = { ...validConfig, @@ -233,4 +233,96 @@ describe('loadConfig', () => { const result = await loadConfig('/fake/config.json'); expect(result.agents?.judge?.secret?.value).toBe('$CLAUDE_CODE_OAUTH_TOKEN'); }); + + describe('executorPlugins', () => { + it('accepts a single local plugin', async () => { + const config = { + ...validConfig, + executorPlugins: [{ type: 'local', name: 'my-plugin', path: '/tmp/my-plugin' }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + const result = await loadConfig('/fake/config.json'); + expect(result.executorPlugins).toHaveLength(1); + expect(result.executorPlugins?.[0]).toMatchObject({ type: 'local', name: 'my-plugin', path: '/tmp/my-plugin' }); + }); + + it('accepts a git plugin with branch + subpath', async () => { + const config = { + 
...validConfig, + executorPlugins: [{ + type: 'git', + name: 'shared', + url: 'https://example.com/repo.git', + branch: 'main', + subpath: 'plugins/shared', + }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + const result = await loadConfig('/fake/config.json'); + expect(result.executorPlugins).toHaveLength(1); + }); + + it('throws when executorPlugins is not an array', async () => { + const config = { ...validConfig, executorPlugins: 'not-an-array' }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/executorPlugins must be an array/); + }); + + it('throws when a plugin entry is missing name', async () => { + const config = { + ...validConfig, + executorPlugins: [{ type: 'local', path: '/tmp/x' }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/name/); + }); + + it('throws when a plugin entry has an invalid name with shell-special chars', async () => { + const config = { + ...validConfig, + executorPlugins: [{ type: 'local', name: 'bad name; rm -rf /', path: '/tmp/x' }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/unsupported characters/); + }); + + it('throws when a plugin entry has an invalid type', async () => { + const config = { + ...validConfig, + executorPlugins: [{ type: 'url', name: 'a', url: 'https://x.com' }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/invalid/i); + }); + + it('throws when a local plugin is missing path', async () => { + const config = { + ...validConfig, + executorPlugins: [{ type: 'local', name: 'a' }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/path/); + }); + + it('throws when a git plugin is missing url', async 
() => { + const config = { + ...validConfig, + executorPlugins: [{ type: 'git', name: 'a' }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/url/); + }); + + it('throws when two plugins share the same name', async () => { + const config = { + ...validConfig, + executorPlugins: [ + { type: 'local', name: 'a', path: '/tmp/a' }, + { type: 'local', name: 'a', path: '/tmp/b' }, + ], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/duplicated/); + }); + }); }); diff --git a/src/core/config.ts b/src/core/config.ts index a1d618b..cb03cc6 100644 --- a/src/core/config.ts +++ b/src/core/config.ts @@ -24,6 +24,47 @@ export async function loadConfig(configPath: string): Promise { const VALID_SOURCE_TYPES = ['local', 'git', 'url', 'package']; +const VALID_PLUGIN_TYPES = ['local', 'git']; + +function validateExecutorPluginEntry(plugin: Record, prefix: string): void { + if (!plugin || typeof plugin !== 'object' || Array.isArray(plugin)) { + throw new Error(`${prefix} must be an object`); + } + + if (!plugin.name || typeof plugin.name !== 'string') { + throw new Error(`${prefix} requires a non-empty 'name' string`); + } + if (!/^[a-zA-Z0-9._-]+$/.test(plugin.name)) { + throw new Error( + `${prefix}.name '${plugin.name}' contains unsupported characters. ` + + `Use only letters, digits, '.', '_', or '-'.`, + ); + } + + if (!plugin.type || typeof plugin.type !== 'string') { + throw new Error(`${prefix} missing required field: type`); + } + + if (!VALID_PLUGIN_TYPES.includes(plugin.type)) { + throw new Error( + `${prefix}.type '${plugin.type}' is invalid. 
Must be one of: ${VALID_PLUGIN_TYPES.map(t => `'${t}'`).join(', ')}` + ); + } + + switch (plugin.type) { + case 'local': + if (!plugin.path || typeof plugin.path !== 'string') { + throw new Error(`${prefix} type 'local' requires path to be set`); + } + break; + case 'git': + if (!plugin.url || typeof plugin.url !== 'string') { + throw new Error(`${prefix} type 'git' requires url to be set`); + } + break; + } +} + function validateSourceEntry(source: Record, prefix: string): void { if (!source || typeof source !== 'object' || Array.isArray(source)) { throw new Error(`${prefix} must be an object`); @@ -89,6 +130,23 @@ export function validateConfig(data: unknown, configPath?: string): Config { } } + // Validate executorPlugins (optional) + if (obj.executorPlugins !== undefined) { + if (!Array.isArray(obj.executorPlugins)) { + throw new Error('Config executorPlugins must be an array'); + } + const seenNames = new Set(); + for (let i = 0; i < obj.executorPlugins.length; i++) { + const entry = obj.executorPlugins[i] as Record; + validateExecutorPluginEntry(entry, `executorPlugins[${i}]`); + const name = entry.name as string; + if (seenNames.has(name)) { + throw new Error(`executorPlugins[${i}].name '${name}' is duplicated`); + } + seenNames.add(name); + } + } + // Validate targets if (!Array.isArray(obj.targets) || obj.targets.length === 0) { throw new Error('Config requires at least one target in targets array'); diff --git a/src/sandbox/scaffolding.ts b/src/sandbox/scaffolding.ts index b36c57a..eabb9dc 100644 --- a/src/sandbox/scaffolding.ts +++ b/src/sandbox/scaffolding.ts @@ -1,10 +1,10 @@ import { readFile, readdir, stat as fsStat } from 'node:fs/promises'; import { execFile } from 'node:child_process'; -import { basename, join, relative } from 'node:path'; +import { basename, join, relative, resolve as resolvePath } from 'node:path'; import { tmpdir } from 'node:os'; import type { MicrosandboxClient } from './microsandbox.js'; -import type { Config, TestCase, 
SourceConfig } from '../types.js'; -import { resolveSources } from '../core/source-resolver.js'; +import type { Config, TestCase, SourceConfig, ExecutorPlugin, ResolvedExecutorPlugin } from '../types.js'; +import { resolveSources, resolveSource } from '../core/source-resolver.js'; /** Directories excluded from source uploads — these are large and not useful for evaluation. */ const EXCLUDED_DIRS = [ @@ -82,7 +82,7 @@ async function readDirRecursive( * Create a tar.gz archive of a directory, excluding common bloat dirs. * Archives are cached on disk so concurrent sandboxes reuse the same tarball. */ -async function getSourceArchive(srcPath: string): Promise { +export async function getSourceArchive(srcPath: string): Promise { const cached = sourceArchiveCache.get(srcPath); if (cached) { try { @@ -115,6 +115,35 @@ async function getSourceArchive(srcPath: string): Promise { return tarPath; } +/** + * Tar a host directory, upload the archive to the sandbox, and extract it + * into `sandboxDestDir`. Used for any "copy this directory tree to a known + * path inside the VM" operation (source uploads, plugin installs). + * + * `archiveLabel` should be a slug-safe identifier (used only to name the + * temporary tarball path inside the sandbox). 
+ */ +export async function uploadDirToSandbox( + client: MicrosandboxClient, + hostDir: string, + sandboxDestDir: string, + archiveLabel: string, +): Promise { + const tarPath = await getSourceArchive(hostDir); + const tarData = await readFile(tarPath); + const sandboxTarPath = `/tmp/_${archiveLabel}.tar.gz`; + await client.uploadBinaryFile(sandboxTarPath, tarData); + const result = await client.runCommand( + `mkdir -p '${sandboxDestDir}' && tar xzf '${sandboxTarPath}' -C '${sandboxDestDir}' && rm -f '${sandboxTarPath}'`, + ); + if (result.exitCode !== 0) { + throw new Error( + `Failed to extract '${hostDir}' into sandbox at '${sandboxDestDir}': ` + + `${result.stderr || result.stdout}`, + ); + } +} + /** * Scaffolds a sandbox workspace with up to 3 optional layers: * - Layer 2 (Template): uploads local template directory to /workspace/ @@ -238,15 +267,37 @@ export async function uploadSources( const targetPrefix = `/workspace/sources/${dirName}/`; sandboxDirs.push(targetPrefix); - const tarPath = await getSourceArchive(srcPath); - const tarData = await readFile(tarPath); - const sandboxTarPath = `/tmp/_sources_${dirName}.tar.gz`; - await client.uploadBinaryFile(sandboxTarPath, tarData); - await client.runCommand( - `mkdir -p '${targetPrefix}' && tar xzf '${sandboxTarPath}' -C '${targetPrefix}' && rm -f '${sandboxTarPath}'`, - ); + await uploadDirToSandbox(client, srcPath, targetPrefix, `sources_${dirName}`); } } return sandboxDirs; } + +/** + * Resolve `config.executorPlugins` to host-side directories. Reuses the + * existing source-resolver for git clones (cached under `cacheRepos`) so + * repeat runs don't re-clone the same plugin source. + * + * Returns an empty array when no plugins are configured. 
+ */ +export async function resolveExecutorPlugins( + plugins: ExecutorPlugin[] | undefined, + cacheRepos: string, +): Promise { + if (!plugins || plugins.length === 0) return []; + + return Promise.all(plugins.map(async (plugin) => { + const source: SourceConfig = plugin.type === 'local' + ? { type: 'local', path: plugin.path } + : { + type: 'git', + url: plugin.url, + branch: plugin.branch, + subpath: plugin.subpath, + sparse: plugin.sparse, + }; + const hostDir = await resolveSource(source, { reposDir: cacheRepos }); + return { name: plugin.name, hostDir: resolvePath(hostDir) }; + })); +} diff --git a/src/scoring/__tests__/judge.test.ts b/src/scoring/__tests__/judge.test.ts index c8952c6..0a3691f 100644 --- a/src/scoring/__tests__/judge.test.ts +++ b/src/scoring/__tests__/judge.test.ts @@ -71,6 +71,7 @@ function makeMockAdapter(opts: { stdout: string }) { sandboxCommand: vi.fn().mockReturnValue('claude --print "prompt"'), extractResult: vi.fn().mockImplementation((stdout: string) => stdout), extractLog: vi.fn().mockResolvedValue(null), + installPluginsInSandbox: vi.fn().mockResolvedValue(undefined), }; } diff --git a/src/types.ts b/src/types.ts index 1621ace..c6739ff 100644 --- a/src/types.ts +++ b/src/types.ts @@ -148,6 +148,41 @@ export interface WorkspaceConfig { setupScript?: string; } +/** + * Source of a plugin directory tree to install into the executor's agent CLI. + * Resolved to a local filesystem path by the source resolver; the adapter then + * lays the tree out wherever its CLI expects to find plugins. + */ +export interface LocalExecutorPlugin { + type: 'local'; + /** Plugin slug — used as the directory name under the CLI's plugins dir. */ + name: string; + /** Absolute or relative path to a directory containing the adapter-specific plugin manifest. */ + path: string; +} + +export interface GitExecutorPlugin { + type: 'git'; + name: string; + url: string; + branch?: string; + /** Path within the cloned repo that contains the plugin manifest. 
*/ + subpath?: string; + sparse?: string[]; +} + +export type ExecutorPlugin = LocalExecutorPlugin | GitExecutorPlugin; + +/** + * An ExecutorPlugin after host-side resolution. The adapter receives this and + * decides how to install it inside the sandbox VM. + */ +export interface ResolvedExecutorPlugin { + name: string; + /** Absolute path on the host to the plugin directory. */ + hostDir: string; +} + export interface SecretConfig { /** Raw value or "$ENV_VAR" reference resolved from host environment. */ value: string; @@ -180,6 +215,14 @@ export interface Config { }; targets: TargetConfig[]; workspace?: WorkspaceConfig; + /** + * Plugin directories to install into the executor agent's CLI inside the sandbox VM. + * Each adapter knows where its CLI expects plugins on disk (Claude: `~/.claude/plugins/`, + * Codex: `$CODEX_HOME/skills/` (default `~/.codex/skills/`), Gemini: `~/.gemini/extensions/`). + * Plugins are scoped to the executor — the judge sandbox is intentionally not seeded + * with these so judge scoring stays independent of the executor's tooling. + */ + executorPlugins?: ExecutorPlugin[]; sandbox: SandboxConfig; }