From ef466c06676182f8b772942498f5c01cd4df3db7 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Fri, 15 May 2026 06:32:58 +1200 Subject: [PATCH 1/5] feat(executor): install agent CLI plugins into the executor sandbox MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an `executorPlugins` config field that seeds Claude Code / Codex / Gemini plugins into the executor's agent CLI before each run. Lets you A/B-test whether shipping a plugin (skills, slash commands, marketplace bundles) measurably improves an agent's ability to use the SDK under evaluation. Plugins are installed in the executor sandbox only — the judge sandbox stays plugin-free so its scoring is independent of the executor's tooling. Run the same suite twice (once without `executorPlugins`, once with) and compare per-test-case judge scores. Per-adapter install layout (each adapter owns its own logic, matching how `installCommand` and `extractLog` already work): - claude: extracts the plugin dir to `\$HOME/.claude/plugins/<name>/` and loads it for each session via the documented `--plugin-dir` CLI flag. Requires `.claude-plugin/plugin.json` at the plugin root. Marketplace registration is intentionally skipped — Claude Code's marketplace flow prompts for trust acceptance, which can't be answered in `--print` mode. - codex: walks the plugin's `skills/` subtree and extracts each `SKILL.md`-bearing dir to `\$CODEX_HOME/skills/<skill>/` (auto-discovered). Requires `.codex-plugin/plugin.json` and rejects duplicate skill names across plugins. - gemini: extracts the whole plugin dir to `\$HOME/.gemini/extensions/<name>/`. Requires `gemini-extension.json` at the plugin root. - custom: throws a clear error — silently no-op'ing would make an A/B run meaningless. Plumbing: - Extracted shared `uploadDirToSandbox()` helper in `scaffolding.ts`; `uploadSources()` now uses it too. 
- New `resolveExecutorPlugins()` reuses the existing source resolver so git-sourced plugins share the same `cache/repos/` cache. - Wired between agent-CLI install and agent run in `execute.ts`. Failures write `plugin-install-error.log` to the run dir. Validation: plugin names are slug-safe (letters/digits/\`.\`/\`_\`/\`-\`), unique within the array, and each entry's discriminator-required field (\`path\` for local, \`url\` for git) is checked at config-load time. Adapter-compatibility is enforced at install time, not load time, since support depends on the CLI mode the executor actually runs in. Also fixes a pre-existing framework bug surfaced along the way: the Claude adapter's \`spawnWithSchema\` now wraps non-object JSON schemas under a single \`result\` property and unwraps \`envelope.structured_output.result\` before returning. Claude's \`--json-schema\` rejects top-level non-object schemas; same envelope dance the Codex adapter already does. Docs: README + \`skills/_reference/config-schema.md\` document the new field and the cross-runtime support matrix. Type-check + lint clean; 348 tests pass. 
--- README.md | 39 ++++++++++ skills/_reference/config-schema.md | 45 +++++++++++ src/agents/__tests__/claude.test.ts | 56 ++++++++++++++ src/agents/__tests__/codex.test.ts | 112 +++++++++++++++++++++++++++- src/agents/__tests__/custom.test.ts | 14 ++++ src/agents/__tests__/gemini.test.ts | 56 ++++++++++++++ src/agents/adapter.ts | 11 ++- src/agents/base.ts | 19 ++++- src/agents/claude.ts | 112 +++++++++++++++++++++++++++- src/agents/codex.ts | 106 +++++++++++++++++++++++++- src/agents/gemini.ts | 55 +++++++++++++- src/commands/execute.ts | 19 ++++- src/core/__tests__/config.test.ts | 92 +++++++++++++++++++++++ src/core/config.ts | 58 ++++++++++++++ src/sandbox/scaffolding.ts | 84 ++++++++++++++++++--- src/scoring/__tests__/judge.test.ts | 1 + src/types.ts | 43 +++++++++++ 17 files changed, 900 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index eac76d8..0f52bab 100644 --- a/README.md +++ b/README.md @@ -419,6 +419,45 @@ Template files and setup scripts for the test workspace: | `template` | Local directory uploaded to `/workspace/` in the sandbox | | `setupScript` | Script file uploaded and executed during scaffolding | +### Executor plugins + +Install Claude Code / Codex / Gemini plugins into the executor sandbox before the agent runs. Useful for A/B-testing whether shipping a plugin (skills, slash commands, marketplace bundles) measurably improves an agent's ability to use your SDK. + +Plugins are installed **only** in the executor sandbox — the judge sandbox stays plugin-free so its scoring is independent of the executor's tooling. Run the same suite twice (once without `executorPlugins`, once with) and compare per-test-case judge scores in the inspect UI. 
+ +```json +{ + "executorPlugins": [ + { "type": "local", "name": "my-sdk-skills", "path": "/abs/path/to/plugin-dir" }, + { + "type": "git", + "name": "shared-skills", + "url": "https://github.com/example/skills.git", + "branch": "main", + "subpath": "plugins/shared-skills" + } + ] +} +``` + +| Field | Description | +|---|---| +| `type` | `"local"` or `"git"` | +| `name` | Plugin slug (letters, digits, `.`, `_`, `-`). Must match the plugin manifest's name and must be unique across `executorPlugins`. | +| `path` | For `type: "local"`. Directory on the host containing the adapter-specific manifest. | +| `url` / `branch` / `subpath` / `sparse` | For `type: "git"`. Same semantics as `GitSource` under `privateInfo`. | + +What each adapter expects inside the plugin directory: + +| Adapter | Required file(s) | Where it lands in the sandbox | +|---|---|---| +| `claude` | `.claude-plugin/plugin.json` | Plugin dir extracted to `/root/.claude/plugins/<name>/`, then loaded via the documented `--plugin-dir <path>` CLI flag at each invocation. (Marketplace registration is intentionally skipped — Claude Code's marketplace flow prompts for trust, which can't be answered in `--print` mode.) | +| `codex` | `.codex-plugin/plugin.json` plus one or more `skills/<skill>/SKILL.md` | Each `skills/<skill>/` extracted to `$CODEX_HOME/skills/<skill>/`. Codex auto-discovers skills from that directory. | +| `gemini` | `gemini-extension.json` at the plugin root | The whole plugin dir extracted to `$HOME/.gemini/extensions/<name>/`. | +| custom | — | Not supported. The adapter raises a clear error at install time. | + +Adapters fail fast at install time if the required manifest is missing, so an A/B run cannot silently no-op against the wrong CLI. 
+ ### Sandbox Resource limits, secrets, and environment variables for sandbox VMs: diff --git a/skills/_reference/config-schema.md b/skills/_reference/config-schema.md index 6e96773..24b8279 100644 --- a/skills/_reference/config-schema.md +++ b/skills/_reference/config-schema.md @@ -9,6 +9,7 @@ | `agents` | `object` | No | Per-role agent configuration. | | `targets` | `TargetConfig[]` | **Yes** | Non-empty array. Docker images for sandboxed execution. | | `workspace` | `WorkspaceConfig` | No | Workspace template and setup. | +| `executorPlugins` | `ExecutorPlugin[]` | No | Plugin directories installed into the executor's agent CLI inside the sandbox (Claude plugins loaded via `--plugin-dir`, Codex skills, Gemini extensions). Not installed in the judge sandbox — that's intentional, so the judge stays independent of the executor's tooling. | | `sandbox` | `SandboxConfig` | **Yes** | Must be an object (can be `{}`). Resource limits, secrets, env vars. | ## SourceConfig (discriminated union on `type`) @@ -136,6 +137,49 @@ Custom agents (any command not in the table above) **must** provide `envVar` and | `template` | `string` | No — local directory to copy into sandbox workspace | | `setupScript` | `string` | No — path to script run during workspace setup | +## ExecutorPlugin (discriminated union on `type`) + +A plugin tree installed into the executor's agent CLI. Use these to A/B test +whether shipping skills/plugins to the executor improves judge scores. Plugins +are installed **only** in the executor sandbox; the judge sandbox is kept +plugin-free so its scoring is independent of the executor's tooling. 
+ +Each entry has a `name` (slug — letters/digits/`.`/`_`/`-` only) plus the +discriminator: + +### LocalExecutorPlugin (`type: "local"`) + +| Field | Type | Required | +|-------|------|----------| +| `type` | `"local"` | Yes | +| `name` | `string` | Yes — plugin slug | +| `path` | `string` | Yes — absolute or relative directory on the host | + +### GitExecutorPlugin (`type: "git"`) + +| Field | Type | Required | +|-------|------|----------| +| `type` | `"git"` | Yes | +| `name` | `string` | Yes — plugin slug | +| `url` | `string` | Yes — git repository URL | +| `branch` | `string` | No | +| `subpath` | `string` | No — path within the repo to the plugin dir | +| `sparse` | `string[]` | No — sparse checkout paths | + +### Per-adapter expectations + +What an adapter requires inside the plugin directory: + +| Adapter | Required file(s) | Sandbox destination | +|---|---|---| +| `claude` | `.claude-plugin/plugin.json` at plugin root | Plugin dir extracted to `/root/.claude/plugins/<name>/`; loaded for each session via the `--plugin-dir <path>` CLI flag. | +| `codex` | `.codex-plugin/plugin.json` at plugin root, and one or more `skills/<skill>/SKILL.md` files | Each `skills/<skill>/` dir extracted to `$CODEX_HOME/skills/<skill>/` (auto-discovered). | +| `gemini` | `gemini-extension.json` at plugin root | Entire plugin dir extracted to `$HOME/.gemini/extensions/<name>/`. | +| custom | — | Not supported. Adapter throws a clear error if `executorPlugins` is non-empty. | + +Each adapter fails fast at install time if its required file is missing — the +A/B comparison won't silently no-op. + ## Validation Rules 1. Root must be a JSON object @@ -147,6 +191,7 @@ Custom agents (any command not in the table above) **must** provide `envVar` and 7. `agents.executor` and `agents.judge` must have `secret.value` (non-empty string) 8. Custom agents must provide `envVar` and `baseUrl` in their secret 9. `baseUrl` must be a parseable URL +10. 
`executorPlugins`, if present, must be an array; each entry needs a `name` (slug-safe) and a valid `type` (`local` or `git`); names must be unique ## Minimal Examples diff --git a/src/agents/__tests__/claude.test.ts b/src/agents/__tests__/claude.test.ts index 7e18993..3d51a8a 100644 --- a/src/agents/__tests__/claude.test.ts +++ b/src/agents/__tests__/claude.test.ts @@ -1,15 +1,28 @@ import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { access } from 'node:fs/promises'; import { spawnAgent, spawnInteractive } from '../spawn.js'; +import { uploadDirToSandbox } from '../../sandbox/scaffolding.js'; import { ClaudeAdapter } from '../claude.js'; import { makeAgentResult } from '../../__tests__/helpers/fixtures.js'; +import { makeMockSandboxClient } from '../../__tests__/helpers/mock-sandbox-client.js'; vi.mock('../spawn.js', () => ({ spawnAgent: vi.fn(), spawnInteractive: vi.fn(), })); +vi.mock('node:fs/promises', () => ({ + access: vi.fn(), +})); + +vi.mock('../../sandbox/scaffolding.js', () => ({ + uploadDirToSandbox: vi.fn(), +})); + const mockSpawnAgent = vi.mocked(spawnAgent); const mockSpawnInteractive = vi.mocked(spawnInteractive); +const mockAccess = vi.mocked(access); +const mockUploadDir = vi.mocked(uploadDirToSandbox); describe('ClaudeAdapter', () => { let adapter: ClaudeAdapter; @@ -131,4 +144,47 @@ describe('ClaudeAdapter', () => { expect(adapter.installCommand).toBe('npm i -g @anthropic-ai/claude-code'); }); }); + + describe('installPluginsInSandbox', () => { + it('is a no-op when given an empty plugin list', async () => { + const client = makeMockSandboxClient(); + await adapter.installPluginsInSandbox(client as any, []); + expect(client.runCommand).not.toHaveBeenCalled(); + expect(client.uploadFiles).not.toHaveBeenCalled(); + }); + + it('throws clearly when a plugin is missing its Claude manifest', async () => { + mockAccess.mockRejectedValueOnce(new Error('ENOENT')); + const client = makeMockSandboxClient(); + await 
expect(adapter.installPluginsInSandbox(client as any, [ + { name: 'broken', hostDir: '/tmp/broken' }, + ])).rejects.toThrow(/\.claude-plugin\/plugin\.json/); + expect(client.runCommand).not.toHaveBeenCalled(); + }); + + it('extracts each plugin into /root/.claude/plugins/ and records the paths', async () => { + mockAccess.mockResolvedValue(undefined); + const client = makeMockSandboxClient(); + + await adapter.installPluginsInSandbox(client as any, [ + { name: 'plugin-a', hostDir: '/tmp/a' }, + { name: 'plugin-b', hostDir: '/tmp/b' }, + ]); + + expect(mockUploadDir).toHaveBeenCalledTimes(2); + expect(mockUploadDir).toHaveBeenCalledWith(client, '/tmp/a', '/root/.claude/plugins/plugin-a', 'plugin_plugin-a'); + expect(mockUploadDir).toHaveBeenCalledWith(client, '/tmp/b', '/root/.claude/plugins/plugin-b', 'plugin_plugin-b'); + + // sandboxCommand should now emit --plugin-dir for each plugin. + const cmd = adapter.sandboxCommand('do the thing'); + expect(cmd).toContain("--plugin-dir '/root/.claude/plugins/plugin-a'"); + expect(cmd).toContain("--plugin-dir '/root/.claude/plugins/plugin-b'"); + }); + + it('sandboxCommand does not include --plugin-dir flags when no plugins have been installed', () => { + const fresh = new ClaudeAdapter({ command: 'claude' }); + const cmd = fresh.sandboxCommand('do the thing'); + expect(cmd).not.toContain('--plugin-dir'); + }); + }); }); diff --git a/src/agents/__tests__/codex.test.ts b/src/agents/__tests__/codex.test.ts index 49b62d3..7fab352 100644 --- a/src/agents/__tests__/codex.test.ts +++ b/src/agents/__tests__/codex.test.ts @@ -1,8 +1,10 @@ import { describe, it, expect, vi, beforeEach } from 'vitest'; -import { writeFile, readFile, rm } from 'node:fs/promises'; +import { writeFile, readFile, rm, access, readdir, stat } from 'node:fs/promises'; import { spawnAgent, spawnInteractive } from '../spawn.js'; +import { uploadDirToSandbox } from '../../sandbox/scaffolding.js'; import { CodexAdapter } from '../codex.js'; import { 
makeAgentResult } from '../../__tests__/helpers/fixtures.js'; +import { makeMockSandboxClient } from '../../__tests__/helpers/mock-sandbox-client.js'; vi.mock('../spawn.js', () => ({ spawnAgent: vi.fn(), @@ -13,6 +15,13 @@ vi.mock('node:fs/promises', () => ({ writeFile: vi.fn().mockResolvedValue(undefined), readFile: vi.fn(), rm: vi.fn().mockResolvedValue(undefined), + access: vi.fn(), + readdir: vi.fn(), + stat: vi.fn(), +})); + +vi.mock('../../sandbox/scaffolding.js', () => ({ + uploadDirToSandbox: vi.fn(), })); const mockSpawnAgent = vi.mocked(spawnAgent); @@ -20,6 +29,10 @@ const mockSpawnInteractive = vi.mocked(spawnInteractive); const mockWriteFile = vi.mocked(writeFile); const mockReadFile = vi.mocked(readFile); const mockRm = vi.mocked(rm); +const mockAccess = vi.mocked(access); +const mockReaddir = vi.mocked(readdir); +const mockStat = vi.mocked(stat); +const mockUploadDir = vi.mocked(uploadDirToSandbox); describe('CodexAdapter', () => { let adapter: CodexAdapter; @@ -120,4 +133,101 @@ describe('CodexAdapter', () => { expect(adapter.installCommand).toBe('npm i -g @openai/codex@0.93.0'); }); }); + + describe('installPluginsInSandbox', () => { + function makeDirent(name: string, isDir: boolean) { + return { + name, + isDirectory: () => isDir, + isFile: () => !isDir, + } as any; + } + + it('is a no-op when given an empty plugin list', async () => { + const client = makeMockSandboxClient(); + await adapter.installPluginsInSandbox(client as any, []); + expect(client.runCommand).not.toHaveBeenCalled(); + }); + + it('throws when a plugin is missing its Codex manifest', async () => { + mockAccess.mockRejectedValueOnce(new Error('ENOENT')); + const client = makeMockSandboxClient(); + await expect(adapter.installPluginsInSandbox(client as any, [ + { name: 'broken', hostDir: '/tmp/broken' }, + ])).rejects.toThrow(/\.codex-plugin\/plugin\.json/); + expect(mockUploadDir).not.toHaveBeenCalled(); + }); + + it('throws when a plugin has no skills/ directory', async () => { 
+ mockAccess.mockResolvedValueOnce(undefined); + mockReaddir.mockRejectedValueOnce(new Error('ENOENT')); + const client = makeMockSandboxClient(); + await expect(adapter.installPluginsInSandbox(client as any, [ + { name: 'empty', hostDir: '/tmp/empty' }, + ])).rejects.toThrow(/no 'skills\/' directory/); + }); + + it('throws when a plugin contributes no SKILL.md-bearing dirs', async () => { + mockAccess.mockResolvedValueOnce(undefined); + mockReaddir.mockResolvedValueOnce([ + makeDirent('not-a-skill', true), + ]); + mockStat.mockRejectedValueOnce(new Error('ENOENT')); + const client = makeMockSandboxClient(); + await expect(adapter.installPluginsInSandbox(client as any, [ + { name: 'shell', hostDir: '/tmp/shell' }, + ])).rejects.toThrow(/no usable Codex skills/); + }); + + it('extracts each plugin skill into $CODEX_HOME/skills/', async () => { + mockAccess.mockResolvedValue(undefined); + // One plugin with two skills. + mockReaddir.mockResolvedValueOnce([ + makeDirent('skill-one', true), + makeDirent('skill-two', true), + makeDirent('not-a-dir', false), + ]); + mockStat.mockResolvedValue({ isFile: () => true } as any); + + const client = makeMockSandboxClient(); + client.runCommand + .mockResolvedValueOnce({ stdout: '/root/.codex', stderr: '', exitCode: 0 }) // printf CODEX_HOME + .mockResolvedValue({ stdout: '', stderr: '', exitCode: 0 }); // mkdir, etc. 
+ + await adapter.installPluginsInSandbox(client as any, [ + { name: 'bundle', hostDir: '/tmp/bundle' }, + ]); + + expect(mockUploadDir).toHaveBeenCalledTimes(2); + expect(mockUploadDir).toHaveBeenCalledWith( + client, + expect.stringContaining('skills/skill-one'), + '/root/.codex/skills/skill-one', + 'codex_skill_skill-one', + ); + expect(mockUploadDir).toHaveBeenCalledWith( + client, + expect.stringContaining('skills/skill-two'), + '/root/.codex/skills/skill-two', + 'codex_skill_skill-two', + ); + }); + + it('throws when two plugins contribute the same skill name', async () => { + mockAccess.mockResolvedValue(undefined); + // Two plugins, each contributing a skill called 'shared'. + mockReaddir + .mockResolvedValueOnce([makeDirent('shared', true)]) + .mockResolvedValueOnce([makeDirent('shared', true)]); + mockStat.mockResolvedValue({ isFile: () => true } as any); + + const client = makeMockSandboxClient(); + client.runCommand.mockResolvedValue({ stdout: '/root/.codex', stderr: '', exitCode: 0 }); + + await expect(adapter.installPluginsInSandbox(client as any, [ + { name: 'plugin-a', hostDir: '/tmp/a' }, + { name: 'plugin-b', hostDir: '/tmp/b' }, + ])).rejects.toThrow(/contributed by more than one plugin/); + }); + }); }); diff --git a/src/agents/__tests__/custom.test.ts b/src/agents/__tests__/custom.test.ts index 51fbba9..214f033 100644 --- a/src/agents/__tests__/custom.test.ts +++ b/src/agents/__tests__/custom.test.ts @@ -182,4 +182,18 @@ describe('CustomAdapter', () => { }); }); + describe('installPluginsInSandbox', () => { + it('is a no-op when given an empty plugin list', async () => { + const adapter = new CustomAdapter({ command: 'my-tool' }); + await expect(adapter.installPluginsInSandbox({} as any, [])).resolves.toBeUndefined(); + }); + + it('throws a clear error when given plugins (custom CLIs have no documented plugin layout)', async () => { + const adapter = new CustomAdapter({ command: 'my-tool' }); + await expect(adapter.installPluginsInSandbox({} as 
any, [ + { name: 'x', hostDir: '/tmp/x' }, + ])).rejects.toThrow(/does not support executorPlugins/); + }); + }); + }); diff --git a/src/agents/__tests__/gemini.test.ts b/src/agents/__tests__/gemini.test.ts index fa02aee..06393a5 100644 --- a/src/agents/__tests__/gemini.test.ts +++ b/src/agents/__tests__/gemini.test.ts @@ -1,15 +1,28 @@ import { describe, it, expect, vi, beforeEach } from 'vitest'; +import { access } from 'node:fs/promises'; import { spawnAgent, spawnInteractive } from '../spawn.js'; +import { uploadDirToSandbox } from '../../sandbox/scaffolding.js'; import { GeminiAdapter } from '../gemini.js'; import { makeAgentResult } from '../../__tests__/helpers/fixtures.js'; +import { makeMockSandboxClient } from '../../__tests__/helpers/mock-sandbox-client.js'; vi.mock('../spawn.js', () => ({ spawnAgent: vi.fn(), spawnInteractive: vi.fn(), })); +vi.mock('node:fs/promises', () => ({ + access: vi.fn(), +})); + +vi.mock('../../sandbox/scaffolding.js', () => ({ + uploadDirToSandbox: vi.fn(), +})); + const mockSpawnAgent = vi.mocked(spawnAgent); const mockSpawnInteractive = vi.mocked(spawnInteractive); +const mockAccess = vi.mocked(access); +const mockUploadDir = vi.mocked(uploadDirToSandbox); describe('GeminiAdapter', () => { let adapter: GeminiAdapter; @@ -104,4 +117,47 @@ describe('GeminiAdapter', () => { expect(adapter.installCommand).toBe('npm i -g @google/gemini-cli'); }); }); + + describe('installPluginsInSandbox', () => { + it('is a no-op when given an empty plugin list', async () => { + const client = makeMockSandboxClient(); + await adapter.installPluginsInSandbox(client as any, []); + expect(client.runCommand).not.toHaveBeenCalled(); + }); + + it('throws when a plugin has no gemini-extension.json manifest', async () => { + mockAccess.mockRejectedValueOnce(new Error('ENOENT')); + const client = makeMockSandboxClient(); + await expect(adapter.installPluginsInSandbox(client as any, [ + { name: 'noext', hostDir: '/tmp/noext' }, + 
])).rejects.toThrow(/gemini-extension\.json/); + }); + + it('extracts each manifest-bearing plugin into ~/.gemini/extensions', async () => { + mockAccess.mockResolvedValue(undefined); + const client = makeMockSandboxClient(); + client.runCommand + .mockResolvedValueOnce({ stdout: '/root', stderr: '', exitCode: 0 }) // printf HOME + .mockResolvedValue({ stdout: '', stderr: '', exitCode: 0 }); // mkdir + + await adapter.installPluginsInSandbox(client as any, [ + { name: 'ext-a', hostDir: '/tmp/a' }, + { name: 'ext-b', hostDir: '/tmp/b' }, + ]); + + expect(mockUploadDir).toHaveBeenCalledTimes(2); + expect(mockUploadDir).toHaveBeenCalledWith( + client, + '/tmp/a', + '/root/.gemini/extensions/ext-a', + 'gemini_ext_ext-a', + ); + expect(mockUploadDir).toHaveBeenCalledWith( + client, + '/tmp/b', + '/root/.gemini/extensions/ext-b', + 'gemini_ext_ext-b', + ); + }); + }); }); diff --git a/src/agents/adapter.ts b/src/agents/adapter.ts index ab833d4..c74e88c 100644 --- a/src/agents/adapter.ts +++ b/src/agents/adapter.ts @@ -1,4 +1,4 @@ -import { AgentConfig, AgentResult } from '../types.js'; +import { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { ClaudeAdapter } from './claude.js'; import { CodexAdapter } from './codex.js'; @@ -34,6 +34,15 @@ export interface AgentAdapter { /** Extract agent session log from inside the sandbox. Returns raw log content or null. */ extractLog(client: MicrosandboxClient): Promise; + + /** + * Install plugin directory trees into the running sandbox so the agent CLI + * picks them up at startup. Each adapter knows the CLI-specific layout + * (e.g. `/root/.claude/plugins/<name>/` loaded via `--plugin-dir` flags for Claude). + * Adapters whose CLI cannot load plugins in non-interactive mode raise a + * clear error here instead of silently succeeding. 
+ */ + installPluginsInSandbox(client: MicrosandboxClient, plugins: ResolvedExecutorPlugin[]): Promise; } const KNOWN_ADAPTERS: Record AgentAdapter> = { diff --git a/src/agents/base.ts b/src/agents/base.ts index 4a7b9a3..aa141a2 100644 --- a/src/agents/base.ts +++ b/src/agents/base.ts @@ -1,4 +1,4 @@ -import type { AgentConfig, AgentResult } from '../types.js'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; import type { AgentAdapter } from './adapter.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; import { spawnAgent, spawnInteractive } from './spawn.js'; @@ -85,6 +85,23 @@ export abstract class BaseAdapter implements AgentAdapter { return null; } + /** + * Install plugin directories into the sandbox VM. Subclasses override with + * CLI-specific layout (Claude: `/root/.claude/plugins/`, Codex: `$CODEX_HOME/skills/`). + * Default raises a clear error so adapters that don't (yet) support plugins + * fail loudly when the user wires `executorPlugins` against them. + */ + async installPluginsInSandbox( + _client: MicrosandboxClient, + plugins: ResolvedExecutorPlugin[], + ): Promise { + if (plugins.length === 0) return; + throw new Error( + `Agent adapter '${this.name}' does not support executorPlugins. ` + + `Either remove executorPlugins from config or switch executor to an adapter that supports plugin loading.`, + ); + } + /** Shared helper: spawn the agent process with piped stdio. 
*/ protected spawn(args: string[], workDir: string, env?: Record, timeout?: number, stdin?: string): Promise { return spawnAgent(this.config.command, args, { cwd: workDir, env, timeout, stdin }); diff --git a/src/agents/claude.ts b/src/agents/claude.ts index 71c00b5..a5b1ded 100644 --- a/src/agents/claude.ts +++ b/src/agents/claude.ts @@ -1,5 +1,8 @@ -import type { AgentConfig, AgentResult } from '../types.js'; +import { access } from 'node:fs/promises'; +import { join } from 'node:path'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; +import { uploadDirToSandbox } from '../sandbox/scaffolding.js'; import { BaseAdapter } from './base.js'; export class ClaudeAdapter extends BaseAdapter { @@ -9,6 +12,13 @@ export class ClaudeAdapter extends BaseAdapter { readonly defaultEnvVar = 'ANTHROPIC_API_KEY'; readonly defaultBaseUrl = 'https://api.anthropic.com'; + /** + * Sandbox paths of plugins extracted by `installPluginsInSandbox()`. + * `sandboxCommand()` reads this to emit `--plugin-dir ` flags so the + * Claude CLI loads each plugin for the run. Populated only after install. + */ + private installedPluginDirs: string[] = []; + constructor(config: AgentConfig) { super(config); } @@ -19,7 +29,14 @@ export class ClaudeAdapter extends BaseAdapter { const schemaFlags = schema ? ` --output-format json --json-schema '${this.escapeForShell(JSON.stringify(schema))}'` : ''; - const cmd = `cd ${workDir} && IS_SANDBOX=1 claude --print --dangerously-skip-permissions ${args.join(' ')} '${escaped}'${schemaFlags}`.trimEnd(); + // --plugin-dir is the documented Claude CLI flag for loading a local + // plugin directory for the session. It's the simplest mechanism for our + // executorPlugins use case — no marketplace registration, no trust + // prompt, works cleanly in --print mode. 
+ const pluginFlags = this.installedPluginDirs + .map((dir) => ` --plugin-dir '${dir}'`) + .join(''); + const cmd = `cd ${workDir} && IS_SANDBOX=1 claude --print --dangerously-skip-permissions${pluginFlags} ${args.join(' ')} '${escaped}'${schemaFlags}`.trimEnd(); return cmd; } @@ -33,16 +50,47 @@ export class ClaudeAdapter extends BaseAdapter { workDir: string, env?: Record, ): Promise { + // Claude's --json-schema currently rejects top-level non-object schemas + // (the API returns "400 tools.N.custom.input_schema.type: Input should be 'object'"). + // Wrap non-object schemas under a single `result` property and unwrap before + // returning, so callers can hand us arrays / primitives transparently. + // (Codex has the same wrap dance in its own adapter.) + const rootSchema = schema as { type?: string }; + const needsWrapper = rootSchema.type !== 'object'; + const effectiveSchema = needsWrapper + ? { + type: 'object', + properties: { result: schema }, + required: ['result'], + additionalProperties: false, + } + : schema; + const args = [ '--print', '--output-format', 'json', '--json-schema', - JSON.stringify(schema), + JSON.stringify(effectiveSchema), ...(this.config.args ?? []), ]; - return this.spawn(args, workDir, env, undefined, prompt); + const result = await this.spawn(args, workDir, env, undefined, prompt); + + if (needsWrapper) { + try { + const envelope = JSON.parse(result.stdout) as Record; + const so = envelope.structured_output as Record | undefined; + if (so && 'result' in so) { + envelope.structured_output = so.result; + result.stdout = JSON.stringify(envelope); + } + } catch { + // Leave stdout untouched — parseEnvelope will handle it / trigger a retry. + } + } + + return result; } async extractLog(client: MicrosandboxClient): Promise { @@ -58,6 +106,62 @@ export class ClaudeAdapter extends BaseAdapter { } } + /** + * Install Claude Code plugins for the executor's CLI session. 
+ * + * We extract each plugin directory under `/root/.claude/plugins/<name>/` + * and remember the paths in `installedPluginDirs`. `sandboxCommand()` then + * emits `--plugin-dir <path>` flags so the CLI loads each plugin for the + * run. This is the documented "load a local plugin for this session" + * mechanism (see Claude Code CLI reference); no marketplace registration, + * settings.json edits, or trust prompts are needed — important because + * trust prompts cannot be answered in `--print` mode. + * + * Plugins are validated host-side first: missing `.claude-plugin/plugin.json` + * fails fast with a clear error. + */ + async installPluginsInSandbox( + client: MicrosandboxClient, + plugins: ResolvedExecutorPlugin[], + ): Promise { + if (plugins.length === 0) return; + + // Validate every plugin host-side before doing any sandbox work. + for (const plugin of plugins) { + const manifestPath = join(plugin.hostDir, '.claude-plugin', 'plugin.json'); + try { + await access(manifestPath); + } catch { + throw new Error( + `Plugin '${plugin.name}' is missing the Claude manifest at ${manifestPath}. ` + + `Each Claude Code plugin must contain a .claude-plugin/plugin.json file.`, + ); + } + } + + // Target images run as root in the existing framework (no user override + // surface in microsandbox config — see src/sandbox/microsandbox.ts). + // Hardcoding /root avoids brittleness around how the sandbox shell + // expands $HOME, which we've seen return / instead of /root on some + // images. 
+ const pluginsRoot = '/root/.claude/plugins'; + + const setup = await client.runCommand(`mkdir -p '${pluginsRoot}'`); + if (setup.exitCode !== 0) { + throw new Error( + `Failed to prepare ${pluginsRoot} in sandbox: ${setup.stderr || setup.stdout}`, + ); + } + + const installedDirs: string[] = []; + for (const plugin of plugins) { + const destDir = `${pluginsRoot}/${plugin.name}`; + await uploadDirToSandbox(client, plugin.hostDir, destDir, `plugin_${plugin.name}`); + installedDirs.push(destDir); + } + this.installedPluginDirs = installedDirs; + } + protected parseEnvelope(result: AgentResult): AgentResult | null { try { const envelope = JSON.parse(result.stdout); diff --git a/src/agents/codex.ts b/src/agents/codex.ts index 3ce6670..93ebaee 100644 --- a/src/agents/codex.ts +++ b/src/agents/codex.ts @@ -1,8 +1,9 @@ -import { writeFile, readFile, rm } from 'node:fs/promises'; +import { writeFile, readFile, rm, access, readdir, stat as fsStat } from 'node:fs/promises'; import { join } from 'node:path'; import { tmpdir } from 'node:os'; -import type { AgentConfig, AgentResult } from '../types.js'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; +import { uploadDirToSandbox } from '../sandbox/scaffolding.js'; import { BaseAdapter } from './base.js'; export class CodexAdapter extends BaseAdapter { @@ -95,6 +96,107 @@ export class CodexAdapter extends BaseAdapter { return result; } + /** + * Install plugin skills into the Codex CLI's auto-discovered skills dir. + * + * Codex doesn't have a "plugin" abstraction like Claude's marketplaces; it + * auto-discovers individual skills under `${CODEX_HOME:-$HOME/.codex}/skills//`. + * For each plugin we iterate its `skills/` subtree and lay out each + * SKILL.md-bearing directory at `$CODEX_HOME/skills//`. 
+ * + * A plugin contributes nothing to Codex if it has no `.codex-plugin/plugin.json` + * manifest, or if it has no `skills/` subdirectory with at least one + * SKILL.md. We fail fast in that case so the user knows the A/B comparison + * won't actually exercise the plugin. + */ + async installPluginsInSandbox( + client: MicrosandboxClient, + plugins: ResolvedExecutorPlugin[], + ): Promise { + if (plugins.length === 0) return; + + type SkillInstall = { srcDir: string; skillName: string }; + const skillsToInstall: Array<{ plugin: string; install: SkillInstall }> = []; + + for (const plugin of plugins) { + const manifestPath = join(plugin.hostDir, '.codex-plugin', 'plugin.json'); + try { + await access(manifestPath); + } catch { + throw new Error( + `Plugin '${plugin.name}' is missing the Codex manifest at ${manifestPath}. ` + + `Codex executors need each plugin to ship a .codex-plugin/plugin.json file.`, + ); + } + + const skillsDir = join(plugin.hostDir, 'skills'); + let entries; + try { + entries = await readdir(skillsDir, { withFileTypes: true }); + } catch { + throw new Error( + `Plugin '${plugin.name}' has no 'skills/' directory at ${skillsDir}. ` + + `Codex auto-discovers skills from individual subdirectories — bundle each as /skills//SKILL.md.`, + ); + } + + let pluginContributed = false; + for (const entry of entries) { + if (!entry.isDirectory()) continue; + const entryName = String(entry.name); + const skillDir = join(skillsDir, entryName); + const skillFile = join(skillDir, 'SKILL.md'); + try { + const skillStat = await fsStat(skillFile); + if (!skillStat.isFile()) continue; + } catch { + continue; + } + skillsToInstall.push({ + plugin: plugin.name, + install: { srcDir: skillDir, skillName: entryName }, + }); + pluginContributed = true; + } + + if (!pluginContributed) { + throw new Error( + `Plugin '${plugin.name}' contains no usable Codex skills (no //SKILL.md). 
` + + `Each plugin must contribute at least one SKILL.md file under its skills/ subdirectory.`, + ); + } + } + + const homeResult = await client.runCommand('printf %s "${CODEX_HOME:-${HOME:-/root}/.codex}"'); + const codexHome = homeResult.stdout.trim() || '/root/.codex'; + const codexSkillsDir = `${codexHome}/skills`; + + const setup = await client.runCommand(`mkdir -p '${codexSkillsDir}'`); + if (setup.exitCode !== 0) { + throw new Error( + `Failed to prepare ${codexSkillsDir} in sandbox: ${setup.stderr || setup.stdout}`, + ); + } + + const seenSkillNames = new Set(); + for (const { plugin, install } of skillsToInstall) { + if (seenSkillNames.has(install.skillName)) { + throw new Error( + `Skill '${install.skillName}' is contributed by more than one plugin (latest: '${plugin}'). ` + + `Codex requires skill names to be unique across all installed plugins.`, + ); + } + seenSkillNames.add(install.skillName); + const destDir = `${codexSkillsDir}/${install.skillName}`; + await uploadDirToSandbox( + client, + install.srcDir, + destDir, + `codex_skill_${install.skillName}`, + ); + } + } + async extractLog(client: MicrosandboxClient): Promise { const result = await client.runCommand( "find / -path '*/.codex/sessions/*.jsonl' -type f 2>/dev/null | sort | tail -1", diff --git a/src/agents/gemini.ts b/src/agents/gemini.ts index 9de4f88..fb875eb 100644 --- a/src/agents/gemini.ts +++ b/src/agents/gemini.ts @@ -1,5 +1,8 @@ -import type { AgentConfig, AgentResult } from '../types.js'; +import { access } from 'node:fs/promises'; +import { join } from 'node:path'; +import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; import type { MicrosandboxClient } from '../sandbox/microsandbox.js'; +import { uploadDirToSandbox } from '../sandbox/scaffolding.js'; import { BaseAdapter } from './base.js'; export class GeminiAdapter extends BaseAdapter { @@ -40,6 +43,56 @@ export class GeminiAdapter extends BaseAdapter { return this.spawn(args, workDir, env, 
undefined, prompt); } + /** + * Install plugin directories into the Gemini CLI's extensions folder. + * + * Gemini's extension model expects each extension dir to contain a + * `gemini-extension.json` manifest at its root (see Gemini CLI docs: + * Extensions reference). Auto-discovery loads extensions from + * `${GEMINI_HOME:-$HOME/.gemini}/extensions//`. + * + * For each plugin we look for `gemini-extension.json` at the plugin root. + * If present, the entire plugin directory is treated as a Gemini extension + * and laid out at the expected location. If absent, we throw — the plugin + * has nothing the Gemini CLI knows how to load, and an A/B comparison that + * silently no-ops would be misleading. + */ + async installPluginsInSandbox( + client: MicrosandboxClient, + plugins: ResolvedExecutorPlugin[], + ): Promise { + if (plugins.length === 0) return; + + for (const plugin of plugins) { + const manifestPath = join(plugin.hostDir, 'gemini-extension.json'); + try { + await access(manifestPath); + } catch { + throw new Error( + `Plugin '${plugin.name}' has no Gemini extension manifest at ${manifestPath}. ` + + `Gemini CLI loads extensions from a 'gemini-extension.json' file at the extension root. 
` + + `Add one to the plugin directory or remove '${plugin.name}' from executorPlugins when running the Gemini executor.`, + ); + } + } + + const homeResult = await client.runCommand('printf %s "${HOME:-/root}"'); + const home = homeResult.stdout.trim() || '/root'; + const extensionsDir = `${home}/.gemini/extensions`; + + const setup = await client.runCommand(`mkdir -p '${extensionsDir}'`); + if (setup.exitCode !== 0) { + throw new Error( + `Failed to prepare ${extensionsDir} in sandbox: ${setup.stderr || setup.stdout}`, + ); + } + + for (const plugin of plugins) { + const destDir = `${extensionsDir}/${plugin.name}`; + await uploadDirToSandbox(client, plugin.hostDir, destDir, `gemini_ext_${plugin.name}`); + } + } + async extractLog(client: MicrosandboxClient): Promise { const result = await client.runCommand( "find / -path '*/.gemini/tmp/*/chats/session-*.jsonl' -type f 2>/dev/null | sort | tail -1", diff --git a/src/commands/execute.ts b/src/commands/execute.ts index af24ee0..851357f 100644 --- a/src/commands/execute.ts +++ b/src/commands/execute.ts @@ -5,7 +5,7 @@ import { loadConfig } from '../core/config.js'; import { loadTestSuite, saveResult, saveBinaryResult, formatElapsed } from '../core/suite-io.js'; import { MicrosandboxClient, buildSecrets, buildAgentSecret, resolveEnv, type CommandResult } from '../sandbox/microsandbox.js'; import { createEgressLogger } from '../sandbox/egress-logger.js'; -import { scaffoldWorkspace } from '../sandbox/scaffolding.js'; +import { scaffoldWorkspace, resolveExecutorPlugins } from '../sandbox/scaffolding.js'; import { WorkerPool } from '../sandbox/worker-pool.js'; import { createAdapter } from '../agents/adapter.js'; import { getPackageSource, getUrlSources, getFileSources } from '../types.js'; @@ -203,6 +203,23 @@ export async function executeTestCase( } } + // Install executor plugins (Claude Code marketplace, Codex skills, Gemini extensions). 
+ // Plugins go AFTER the agent CLI install so any per-CLI plugin paths exist, and + // BEFORE the agent runs so the plugin manifest is in place when the CLI boots. + // Plugins are deliberately not installed in the judge sandbox — keeping the + // judge's environment independent of the executor's tooling is what makes the + // A/B comparison meaningful. + if (config.executorPlugins && config.executorPlugins.length > 0) { + const resolvedPlugins = await resolveExecutorPlugins(config.executorPlugins, paths.cacheRepos); + try { + await adapter.installPluginsInSandbox(client, resolvedPlugins); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + await saveResult(paths, testCase.id, 'plugin-install-error.log', message, target.name); + throw new Error(`Executor plugin install failed: ${message}`); + } + } + // Upload public file sources (docs directories, etc.) for the executor const publicFileSources = getFileSources(config.publicInfo ?? []); const publicSourceDirs = publicFileSources.length > 0 diff --git a/src/core/__tests__/config.test.ts b/src/core/__tests__/config.test.ts index 61fe7da..f351408 100644 --- a/src/core/__tests__/config.test.ts +++ b/src/core/__tests__/config.test.ts @@ -223,4 +223,96 @@ describe('loadConfig', () => { mockReadFile.mockResolvedValue(JSON.stringify(config)); await expect(loadConfig('/fake/config.json')).rejects.toThrow(/valid URL/); }); + + describe('executorPlugins', () => { + it('accepts a single local plugin', async () => { + const config = { + ...validConfig, + executorPlugins: [{ type: 'local', name: 'my-plugin', path: '/tmp/my-plugin' }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + const result = await loadConfig('/fake/config.json'); + expect(result.executorPlugins).toHaveLength(1); + expect(result.executorPlugins?.[0]).toMatchObject({ type: 'local', name: 'my-plugin', path: '/tmp/my-plugin' }); + }); + + it('accepts a git plugin with branch + subpath', async () => { + const 
config = { + ...validConfig, + executorPlugins: [{ + type: 'git', + name: 'shared', + url: 'https://example.com/repo.git', + branch: 'main', + subpath: 'plugins/shared', + }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + const result = await loadConfig('/fake/config.json'); + expect(result.executorPlugins).toHaveLength(1); + }); + + it('throws when executorPlugins is not an array', async () => { + const config = { ...validConfig, executorPlugins: 'not-an-array' }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/executorPlugins must be an array/); + }); + + it('throws when a plugin entry is missing name', async () => { + const config = { + ...validConfig, + executorPlugins: [{ type: 'local', path: '/tmp/x' }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/name/); + }); + + it('throws when a plugin entry has an invalid name with shell-special chars', async () => { + const config = { + ...validConfig, + executorPlugins: [{ type: 'local', name: 'bad name; rm -rf /', path: '/tmp/x' }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/unsupported characters/); + }); + + it('throws when a plugin entry has an invalid type', async () => { + const config = { + ...validConfig, + executorPlugins: [{ type: 'url', name: 'a', url: 'https://x.com' }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/invalid/i); + }); + + it('throws when a local plugin is missing path', async () => { + const config = { + ...validConfig, + executorPlugins: [{ type: 'local', name: 'a' }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/path/); + }); + + it('throws when a git plugin is missing 
url', async () => { + const config = { + ...validConfig, + executorPlugins: [{ type: 'git', name: 'a' }], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/url/); + }); + + it('throws when two plugins share the same name', async () => { + const config = { + ...validConfig, + executorPlugins: [ + { type: 'local', name: 'a', path: '/tmp/a' }, + { type: 'local', name: 'a', path: '/tmp/b' }, + ], + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/duplicated/); + }); + }); }); diff --git a/src/core/config.ts b/src/core/config.ts index 6f2df48..78a29cf 100644 --- a/src/core/config.ts +++ b/src/core/config.ts @@ -24,6 +24,47 @@ export async function loadConfig(configPath: string): Promise { const VALID_SOURCE_TYPES = ['local', 'git', 'url', 'package']; +const VALID_PLUGIN_TYPES = ['local', 'git']; + +function validateExecutorPluginEntry(plugin: Record, prefix: string): void { + if (!plugin || typeof plugin !== 'object' || Array.isArray(plugin)) { + throw new Error(`${prefix} must be an object`); + } + + if (!plugin.name || typeof plugin.name !== 'string') { + throw new Error(`${prefix} requires a non-empty 'name' string`); + } + if (!/^[a-zA-Z0-9._-]+$/.test(plugin.name)) { + throw new Error( + `${prefix}.name '${plugin.name}' contains unsupported characters. ` + + `Use only letters, digits, '.', '_', or '-'.`, + ); + } + + if (!plugin.type || typeof plugin.type !== 'string') { + throw new Error(`${prefix} missing required field: type`); + } + + if (!VALID_PLUGIN_TYPES.includes(plugin.type)) { + throw new Error( + `${prefix}.type '${plugin.type}' is invalid. 
Must be one of: ${VALID_PLUGIN_TYPES.map(t => `'${t}'`).join(', ')}` + ); + } + + switch (plugin.type) { + case 'local': + if (!plugin.path || typeof plugin.path !== 'string') { + throw new Error(`${prefix} type 'local' requires path to be set`); + } + break; + case 'git': + if (!plugin.url || typeof plugin.url !== 'string') { + throw new Error(`${prefix} type 'git' requires url to be set`); + } + break; + } +} + function validateSourceEntry(source: Record, prefix: string): void { if (!source || typeof source !== 'object' || Array.isArray(source)) { throw new Error(`${prefix} must be an object`); @@ -89,6 +130,23 @@ export function validateConfig(data: unknown, configPath?: string): Config { } } + // Validate executorPlugins (optional) + if (obj.executorPlugins !== undefined) { + if (!Array.isArray(obj.executorPlugins)) { + throw new Error('Config executorPlugins must be an array'); + } + const seenNames = new Set(); + for (let i = 0; i < obj.executorPlugins.length; i++) { + const entry = obj.executorPlugins[i] as Record; + validateExecutorPluginEntry(entry, `executorPlugins[${i}]`); + const name = entry.name as string; + if (seenNames.has(name)) { + throw new Error(`executorPlugins[${i}].name '${name}' is duplicated`); + } + seenNames.add(name); + } + } + // Validate targets if (!Array.isArray(obj.targets) || obj.targets.length === 0) { throw new Error('Config requires at least one target in targets array'); diff --git a/src/sandbox/scaffolding.ts b/src/sandbox/scaffolding.ts index b36c57a..d88f16b 100644 --- a/src/sandbox/scaffolding.ts +++ b/src/sandbox/scaffolding.ts @@ -1,10 +1,10 @@ import { readFile, readdir, stat as fsStat } from 'node:fs/promises'; import { execFile } from 'node:child_process'; -import { basename, join, relative } from 'node:path'; +import { basename, join, relative, resolve as resolvePath } from 'node:path'; import { tmpdir } from 'node:os'; import type { MicrosandboxClient } from './microsandbox.js'; -import type { Config, TestCase, 
SourceConfig } from '../types.js'; -import { resolveSources } from '../core/source-resolver.js'; +import type { Config, TestCase, SourceConfig, ExecutorPlugin, ResolvedExecutorPlugin } from '../types.js'; +import { resolveSources, resolveSource } from '../core/source-resolver.js'; /** Directories excluded from source uploads — these are large and not useful for evaluation. */ const EXCLUDED_DIRS = [ @@ -82,7 +82,7 @@ async function readDirRecursive( * Create a tar.gz archive of a directory, excluding common bloat dirs. * Archives are cached on disk so concurrent sandboxes reuse the same tarball. */ -async function getSourceArchive(srcPath: string): Promise { +export async function getSourceArchive(srcPath: string): Promise { const cached = sourceArchiveCache.get(srcPath); if (cached) { try { @@ -115,6 +115,35 @@ async function getSourceArchive(srcPath: string): Promise { return tarPath; } +/** + * Tar a host directory, upload the archive to the sandbox, and extract it + * into `sandboxDestDir`. Used for any "copy this directory tree to a known + * path inside the VM" operation (source uploads, plugin installs). + * + * `archiveLabel` should be a slug-safe identifier (used only to name the + * temporary tarball path inside the sandbox). 
+ */ +export async function uploadDirToSandbox( + client: MicrosandboxClient, + hostDir: string, + sandboxDestDir: string, + archiveLabel: string, +): Promise { + const tarPath = await getSourceArchive(hostDir); + const tarData = await readFile(tarPath); + const sandboxTarPath = `/tmp/_${archiveLabel}.tar.gz`; + await client.uploadBinaryFile(sandboxTarPath, tarData); + const result = await client.runCommand( + `mkdir -p '${sandboxDestDir}' && tar xzf '${sandboxTarPath}' -C '${sandboxDestDir}' && rm -f '${sandboxTarPath}'`, + ); + if (result.exitCode !== 0) { + throw new Error( + `Failed to extract '${hostDir}' into sandbox at '${sandboxDestDir}': ` + + `${result.stderr || result.stdout}`, + ); + } +} + /** * Scaffolds a sandbox workspace with up to 3 optional layers: * - Layer 2 (Template): uploads local template directory to /workspace/ @@ -238,15 +267,48 @@ export async function uploadSources( const targetPrefix = `/workspace/sources/${dirName}/`; sandboxDirs.push(targetPrefix); - const tarPath = await getSourceArchive(srcPath); - const tarData = await readFile(tarPath); - const sandboxTarPath = `/tmp/_sources_${dirName}.tar.gz`; - await client.uploadBinaryFile(sandboxTarPath, tarData); - await client.runCommand( - `mkdir -p '${targetPrefix}' && tar xzf '${sandboxTarPath}' -C '${targetPrefix}' && rm -f '${sandboxTarPath}'`, - ); + await uploadDirToSandbox(client, srcPath, targetPrefix, `sources_${dirName}`); } } return sandboxDirs; } + +/** + * Resolve `config.executorPlugins` to host-side directories. Reuses the + * existing source-resolver for git clones (cached under `cacheRepos`) so + * repeat runs don't re-clone the same plugin source. + * + * Returns an empty array when no plugins are configured. 
+ */ +export async function resolveExecutorPlugins( + plugins: ExecutorPlugin[] | undefined, + cacheRepos: string, +): Promise { + if (!plugins || plugins.length === 0) return []; + + const resolved: ResolvedExecutorPlugin[] = []; + for (const plugin of plugins) { + if (plugin.type === 'local') { + // Reuse resolveSource for consistent path semantics (resolves relative paths against cwd). + const hostDir = await resolveSource( + { type: 'local', path: plugin.path }, + { reposDir: cacheRepos }, + ); + resolved.push({ name: plugin.name, hostDir: resolvePath(hostDir) }); + } else { + const hostDir = await resolveSource( + { + type: 'git', + url: plugin.url, + branch: plugin.branch, + subpath: plugin.subpath, + sparse: plugin.sparse, + }, + { reposDir: cacheRepos }, + ); + resolved.push({ name: plugin.name, hostDir: resolvePath(hostDir) }); + } + } + return resolved; +} diff --git a/src/scoring/__tests__/judge.test.ts b/src/scoring/__tests__/judge.test.ts index 87c5b1e..798397a 100644 --- a/src/scoring/__tests__/judge.test.ts +++ b/src/scoring/__tests__/judge.test.ts @@ -71,6 +71,7 @@ function makeMockAdapter(opts: { stdout: string }) { sandboxCommand: vi.fn().mockReturnValue('claude --print "prompt"'), extractResult: vi.fn().mockImplementation((stdout: string) => stdout), extractLog: vi.fn().mockResolvedValue(null), + installPluginsInSandbox: vi.fn().mockResolvedValue(undefined), }; } diff --git a/src/types.ts b/src/types.ts index c3dae8f..567c279 100644 --- a/src/types.ts +++ b/src/types.ts @@ -129,6 +129,41 @@ export interface WorkspaceConfig { setupScript?: string; } +/** + * Source of a plugin directory tree to install into the executor's agent CLI. + * Resolved to a local filesystem path by the source resolver; the adapter then + * lays the tree out wherever its CLI expects to find plugins. + */ +export interface LocalExecutorPlugin { + type: 'local'; + /** Plugin slug — used as the directory name under the CLI's plugins dir. 
*/ + name: string; + /** Absolute or relative path to a directory containing the adapter-specific plugin manifest. */ + path: string; +} + +export interface GitExecutorPlugin { + type: 'git'; + name: string; + url: string; + branch?: string; + /** Path within the cloned repo that contains the plugin manifest. */ + subpath?: string; + sparse?: string[]; +} + +export type ExecutorPlugin = LocalExecutorPlugin | GitExecutorPlugin; + +/** + * An ExecutorPlugin after host-side resolution. The adapter receives this and + * decides how to install it inside the sandbox VM. + */ +export interface ResolvedExecutorPlugin { + name: string; + /** Absolute path on the host to the plugin directory. */ + hostDir: string; +} + export interface SecretConfig { /** Raw value or "$ENV_VAR" reference resolved from host environment. */ value: string; @@ -161,6 +196,14 @@ export interface Config { }; targets: TargetConfig[]; workspace?: WorkspaceConfig; + /** + * Plugin directories to install into the executor agent's CLI inside the sandbox VM. + * Each adapter knows where its CLI expects plugins on disk (Claude: `~/.claude/plugins/`, + * Codex: `~/.codex/skills/`, Gemini: `~/.gemini/extensions/`). + * Plugins are scoped to the executor — the judge sandbox is intentionally not seeded + * with these so judge scoring stays independent of the executor's tooling. 
+ */ + executorPlugins?: ExecutorPlugin[]; sandbox: SandboxConfig; } From f71724e575caaad4920bce3fefc14a66c112756a Mon Sep 17 00:00:00 2001 From: nickwinder Date: Fri, 15 May 2026 12:58:16 +1200 Subject: [PATCH 2/5] refactor(plugins): parallelize per-plugin install, drop redundant prep work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Validate manifests and upload plugin dirs concurrently across plugins - Drop pre-emptive `mkdir -p` calls: uploadDirToSandbox already mkdirs the destination as part of the tar-extract command - Hardcode `/root/.codex/skills` and `/root/.gemini/extensions` (matching ClaudeAdapter); removes a per-install `printf HOME` RPC roundtrip in each - Collapse resolveExecutorPlugins to a single SourceConfig-mapping branch - Rename the `stat as fsStat` import alias back to `stat` by renaming the local var (skillStat → md) - Trim narrative block comments --- src/agents/claude.ts | 41 +++++-------------- src/agents/codex.ts | 83 +++++++++++++++++--------------------- src/agents/gemini.ts | 28 ++++++------- src/commands/execute.ts | 8 +--- src/sandbox/scaffolding.ts | 27 ++++--------- 5 files changed, 71 insertions(+), 116 deletions(-) diff --git a/src/agents/claude.ts b/src/agents/claude.ts index a5b1ded..3a576b2 100644 --- a/src/agents/claude.ts +++ b/src/agents/claude.ts @@ -109,16 +109,10 @@ export class ClaudeAdapter extends BaseAdapter { /** * Install Claude Code plugins for the executor's CLI session. * - * We extract each plugin directory under `$HOME/.claude/plugins//` - * and remember the paths in `installedPluginDirs`. `sandboxCommand()` then - * emits `--plugin-dir ` flags so the CLI loads each plugin for the - * run. This is the documented "load a local plugin for this session" - * mechanism (see Claude Code CLI reference); no marketplace registration, - * settings.json edits, or trust prompts are needed — important because - * trust prompts cannot be answered in `--print` mode. 
- * - * Plugins are validated host-side first: missing `.claude-plugin/plugin.json` - * fails fast with a clear error. + * Each plugin is extracted under `/root/.claude/plugins//` and the + * resulting paths are stashed for `sandboxCommand()` to emit as + * `--plugin-dir ` flags. No marketplace registration or trust + * prompt — those can't be answered in `--print` mode. */ async installPluginsInSandbox( client: MicrosandboxClient, @@ -126,8 +120,7 @@ export class ClaudeAdapter extends BaseAdapter { ): Promise { if (plugins.length === 0) return; - // Validate every plugin host-side before doing any sandbox work. - for (const plugin of plugins) { + await Promise.all(plugins.map(async (plugin) => { const manifestPath = join(plugin.hostDir, '.claude-plugin', 'plugin.json'); try { await access(manifestPath); @@ -137,29 +130,17 @@ export class ClaudeAdapter extends BaseAdapter { `Each Claude Code plugin must contain a .claude-plugin/plugin.json file.`, ); } - } + })); - // Target images run as root in the existing framework (no user override - // surface in microsandbox config — see src/sandbox/microsandbox.ts). - // Hardcoding /root avoids brittleness around how the sandbox shell - // expands $HOME, which we've seen return / instead of /root on some - // images. + // Target images run as root; hardcoding /root avoids brittleness around + // how the sandbox shell expands $HOME (some images return / instead). 
const pluginsRoot = '/root/.claude/plugins'; - const setup = await client.runCommand(`mkdir -p '${pluginsRoot}'`); - if (setup.exitCode !== 0) { - throw new Error( - `Failed to prepare ${pluginsRoot} in sandbox: ${setup.stderr || setup.stdout}`, - ); - } - - const installedDirs: string[] = []; - for (const plugin of plugins) { + this.installedPluginDirs = await Promise.all(plugins.map(async (plugin) => { const destDir = `${pluginsRoot}/${plugin.name}`; await uploadDirToSandbox(client, plugin.hostDir, destDir, `plugin_${plugin.name}`); - installedDirs.push(destDir); - } - this.installedPluginDirs = installedDirs; + return destDir; + })); } protected parseEnvelope(result: AgentResult): AgentResult | null { diff --git a/src/agents/codex.ts b/src/agents/codex.ts index 93ebaee..dc86c3c 100644 --- a/src/agents/codex.ts +++ b/src/agents/codex.ts @@ -1,4 +1,4 @@ -import { writeFile, readFile, rm, access, readdir, stat as fsStat } from 'node:fs/promises'; +import { writeFile, readFile, rm, access, readdir, stat } from 'node:fs/promises'; import { join } from 'node:path'; import { tmpdir } from 'node:os'; import type { AgentConfig, AgentResult, ResolvedExecutorPlugin } from '../types.js'; @@ -115,10 +115,9 @@ export class CodexAdapter extends BaseAdapter { ): Promise { if (plugins.length === 0) return; - type SkillInstall = { srcDir: string; skillName: string }; - const skillsToInstall: Array<{ plugin: string; install: SkillInstall }> = []; + type SkillInstall = { plugin: string; srcDir: string; skillName: string }; - for (const plugin of plugins) { + const perPluginSkills = await Promise.all(plugins.map(async (plugin): Promise => { const manifestPath = join(plugin.hostDir, '.codex-plugin', 'plugin.json'); try { await access(manifestPath); @@ -140,61 +139,55 @@ export class CodexAdapter extends BaseAdapter { ); } - let pluginContributed = false; - for (const entry of entries) { - if (!entry.isDirectory()) continue; - const entryName = String(entry.name); - const skillDir = 
join(skillsDir, entryName); - const skillFile = join(skillDir, 'SKILL.md'); - try { - const skillStat = await fsStat(skillFile); - if (!skillStat.isFile()) continue; - } catch { - continue; - } - skillsToInstall.push({ - plugin: plugin.name, - install: { srcDir: skillDir, skillName: entryName }, - }); - pluginContributed = true; - } + const skillCandidates = await Promise.all(entries + .filter((entry) => entry.isDirectory()) + .map(async (entry): Promise => { + const skillName = String(entry.name); + const skillDir = join(skillsDir, skillName); + try { + const md = await stat(join(skillDir, 'SKILL.md')); + if (!md.isFile()) return null; + } catch { + return null; + } + return { plugin: plugin.name, srcDir: skillDir, skillName }; + }), + ); + const skills = skillCandidates.filter((s): s is SkillInstall => s !== null); - if (!pluginContributed) { + if (skills.length === 0) { throw new Error( `Plugin '${plugin.name}' contains no usable Codex skills (no //SKILL.md). ` + `Each plugin must contribute at least one SKILL.md file under its skills/ subdirectory.`, ); } - } + return skills; + })); - const homeResult = await client.runCommand('printf %s "${CODEX_HOME:-${HOME:-/root}/.codex}"'); - const codexHome = homeResult.stdout.trim() || '/root/.codex'; - const codexSkillsDir = `${codexHome}/skills`; - - const setup = await client.runCommand(`mkdir -p '${codexSkillsDir}'`); - if (setup.exitCode !== 0) { - throw new Error( - `Failed to prepare ${codexSkillsDir} in sandbox: ${setup.stderr || setup.stdout}`, - ); - } + // Target images run as root; matches the hardcoded path in ClaudeAdapter. 
+ const codexSkillsDir = '/root/.codex/skills'; const seenSkillNames = new Set(); - for (const { plugin, install } of skillsToInstall) { - if (seenSkillNames.has(install.skillName)) { + const installs: SkillInstall[] = []; + for (const skill of perPluginSkills.flat()) { + if (seenSkillNames.has(skill.skillName)) { throw new Error( - `Skill '${install.skillName}' is contributed by more than one plugin (latest: '${plugin}'). ` + + `Skill '${skill.skillName}' is contributed by more than one plugin (latest: '${skill.plugin}'). ` + `Codex requires skill names to be unique across all installed plugins.`, ); } - seenSkillNames.add(install.skillName); - const destDir = `${codexSkillsDir}/${install.skillName}`; - await uploadDirToSandbox( - client, - install.srcDir, - destDir, - `codex_skill_${install.skillName}`, - ); + seenSkillNames.add(skill.skillName); + installs.push(skill); } + + await Promise.all(installs.map((skill) => + uploadDirToSandbox( + client, + skill.srcDir, + `${codexSkillsDir}/${skill.skillName}`, + `codex_skill_${skill.skillName}`, + ), + )); } async extractLog(client: MicrosandboxClient): Promise { diff --git a/src/agents/gemini.ts b/src/agents/gemini.ts index fb875eb..f0eb9c2 100644 --- a/src/agents/gemini.ts +++ b/src/agents/gemini.ts @@ -63,7 +63,7 @@ export class GeminiAdapter extends BaseAdapter { ): Promise { if (plugins.length === 0) return; - for (const plugin of plugins) { + await Promise.all(plugins.map(async (plugin) => { const manifestPath = join(plugin.hostDir, 'gemini-extension.json'); try { await access(manifestPath); @@ -74,23 +74,19 @@ export class GeminiAdapter extends BaseAdapter { `Add one to the plugin directory or remove '${plugin.name}' from executorPlugins when running the Gemini executor.`, ); } - } - - const homeResult = await client.runCommand('printf %s "${HOME:-/root}"'); - const home = homeResult.stdout.trim() || '/root'; - const extensionsDir = `${home}/.gemini/extensions`; + })); - const setup = await 
client.runCommand(`mkdir -p '${extensionsDir}'`); - if (setup.exitCode !== 0) { - throw new Error( - `Failed to prepare ${extensionsDir} in sandbox: ${setup.stderr || setup.stdout}`, - ); - } + // Target images run as root; matches the hardcoded path in ClaudeAdapter. + const extensionsDir = '/root/.gemini/extensions'; - for (const plugin of plugins) { - const destDir = `${extensionsDir}/${plugin.name}`; - await uploadDirToSandbox(client, plugin.hostDir, destDir, `gemini_ext_${plugin.name}`); - } + await Promise.all(plugins.map((plugin) => + uploadDirToSandbox( + client, + plugin.hostDir, + `${extensionsDir}/${plugin.name}`, + `gemini_ext_${plugin.name}`, + ), + )); } async extractLog(client: MicrosandboxClient): Promise { diff --git a/src/commands/execute.ts b/src/commands/execute.ts index 851357f..9be3f7a 100644 --- a/src/commands/execute.ts +++ b/src/commands/execute.ts @@ -203,12 +203,8 @@ export async function executeTestCase( } } - // Install executor plugins (Claude Code marketplace, Codex skills, Gemini extensions). - // Plugins go AFTER the agent CLI install so any per-CLI plugin paths exist, and - // BEFORE the agent runs so the plugin manifest is in place when the CLI boots. - // Plugins are deliberately not installed in the judge sandbox — keeping the - // judge's environment independent of the executor's tooling is what makes the - // A/B comparison meaningful. + // Plugins are intentionally executor-only — the judge sandbox stays + // plugin-free so its scoring is independent of the executor's tooling. 
if (config.executorPlugins && config.executorPlugins.length > 0) { const resolvedPlugins = await resolveExecutorPlugins(config.executorPlugins, paths.cacheRepos); try { diff --git a/src/sandbox/scaffolding.ts b/src/sandbox/scaffolding.ts index d88f16b..eabb9dc 100644 --- a/src/sandbox/scaffolding.ts +++ b/src/sandbox/scaffolding.ts @@ -287,28 +287,17 @@ export async function resolveExecutorPlugins( ): Promise { if (!plugins || plugins.length === 0) return []; - const resolved: ResolvedExecutorPlugin[] = []; - for (const plugin of plugins) { - if (plugin.type === 'local') { - // Reuse resolveSource for consistent path semantics (resolves relative paths against cwd). - const hostDir = await resolveSource( - { type: 'local', path: plugin.path }, - { reposDir: cacheRepos }, - ); - resolved.push({ name: plugin.name, hostDir: resolvePath(hostDir) }); - } else { - const hostDir = await resolveSource( - { + return Promise.all(plugins.map(async (plugin) => { + const source: SourceConfig = plugin.type === 'local' + ? { type: 'local', path: plugin.path } + : { type: 'git', url: plugin.url, branch: plugin.branch, subpath: plugin.subpath, sparse: plugin.sparse, - }, - { reposDir: cacheRepos }, - ); - resolved.push({ name: plugin.name, hostDir: resolvePath(hostDir) }); - } - } - return resolved; + }; + const hostDir = await resolveSource(source, { reposDir: cacheRepos }); + return { name: plugin.name, hostDir: resolvePath(hostDir) }; + })); } From 60d4f2f032416ed60b8c8da54b5840e1c7e109a0 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Fri, 15 May 2026 13:45:21 +1200 Subject: [PATCH 3/5] fix(plugins): derive sandbox paths from $HOME; name both plugins on skill collision - Restore runtime $HOME / $CODEX_HOME probe in each adapter so install destinations match the documented `$HOME/.claude/plugins/`, `$CODEX_HOME/skills/`, `$HOME/.gemini/extensions/` layout. 
Pre-cleanup, codex/gemini already used the probe; the cleanup hardcoded /root for them and applied the same hardcoding to claude. This restores $HOME derivation for all three with a `/root` shell-side fallback. - Codex: replace the seenSkillNames Set with a Map tracking the originating plugin per skill name; on collision the error now names both plugins instead of only the latest. Reviewer-flagged. --- src/agents/__tests__/codex.test.ts | 4 ++-- src/agents/claude.ts | 8 ++++---- src/agents/codex.ts | 14 ++++++++------ src/agents/gemini.ts | 5 +++-- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/agents/__tests__/codex.test.ts b/src/agents/__tests__/codex.test.ts index 7fab352..b42b4f6 100644 --- a/src/agents/__tests__/codex.test.ts +++ b/src/agents/__tests__/codex.test.ts @@ -213,7 +213,7 @@ describe('CodexAdapter', () => { ); }); - it('throws when two plugins contribute the same skill name', async () => { + it('throws when two plugins contribute the same skill name, naming both', async () => { mockAccess.mockResolvedValue(undefined); // Two plugins, each contributing a skill called 'shared'. mockReaddir @@ -227,7 +227,7 @@ describe('CodexAdapter', () => { await expect(adapter.installPluginsInSandbox(client as any, [ { name: 'plugin-a', hostDir: '/tmp/a' }, { name: 'plugin-b', hostDir: '/tmp/b' }, - ])).rejects.toThrow(/contributed by more than one plugin/); + ])).rejects.toThrow(/'plugin-a'.*'plugin-b'/); }); }); }); diff --git a/src/agents/claude.ts b/src/agents/claude.ts index 3a576b2..47d77fc 100644 --- a/src/agents/claude.ts +++ b/src/agents/claude.ts @@ -109,7 +109,7 @@ export class ClaudeAdapter extends BaseAdapter { /** * Install Claude Code plugins for the executor's CLI session. * - * Each plugin is extracted under `/root/.claude/plugins//` and the + * Each plugin is extracted under `$HOME/.claude/plugins//` and the * resulting paths are stashed for `sandboxCommand()` to emit as * `--plugin-dir ` flags. 
No marketplace registration or trust * prompt — those can't be answered in `--print` mode. @@ -132,9 +132,9 @@ export class ClaudeAdapter extends BaseAdapter { } })); - // Target images run as root; hardcoding /root avoids brittleness around - // how the sandbox shell expands $HOME (some images return / instead). - const pluginsRoot = '/root/.claude/plugins'; + const homeResult = await client.runCommand('printf %s "${HOME:-/root}"'); + const home = homeResult.stdout.trim() || '/root'; + const pluginsRoot = `${home}/.claude/plugins`; this.installedPluginDirs = await Promise.all(plugins.map(async (plugin) => { const destDir = `${pluginsRoot}/${plugin.name}`; diff --git a/src/agents/codex.ts b/src/agents/codex.ts index dc86c3c..0997e72 100644 --- a/src/agents/codex.ts +++ b/src/agents/codex.ts @@ -164,19 +164,21 @@ export class CodexAdapter extends BaseAdapter { return skills; })); - // Target images run as root; matches the hardcoded path in ClaudeAdapter. - const codexSkillsDir = '/root/.codex/skills'; + const homeResult = await client.runCommand('printf %s "${CODEX_HOME:-${HOME:-/root}/.codex}"'); + const codexHome = homeResult.stdout.trim() || '/root/.codex'; + const codexSkillsDir = `${codexHome}/skills`; - const seenSkillNames = new Set(); + const skillOwners = new Map(); const installs: SkillInstall[] = []; for (const skill of perPluginSkills.flat()) { - if (seenSkillNames.has(skill.skillName)) { + const prior = skillOwners.get(skill.skillName); + if (prior !== undefined) { throw new Error( - `Skill '${skill.skillName}' is contributed by more than one plugin (latest: '${skill.plugin}'). ` + + `Skill '${skill.skillName}' is contributed by both '${prior}' and '${skill.plugin}'. 
` + `Codex requires skill names to be unique across all installed plugins.`, ); } - seenSkillNames.add(skill.skillName); + skillOwners.set(skill.skillName, skill.plugin); installs.push(skill); } diff --git a/src/agents/gemini.ts b/src/agents/gemini.ts index f0eb9c2..03d3dcc 100644 --- a/src/agents/gemini.ts +++ b/src/agents/gemini.ts @@ -76,8 +76,9 @@ export class GeminiAdapter extends BaseAdapter { } })); - // Target images run as root; matches the hardcoded path in ClaudeAdapter. - const extensionsDir = '/root/.gemini/extensions'; + const homeResult = await client.runCommand('printf %s "${HOME:-/root}"'); + const home = homeResult.stdout.trim() || '/root'; + const extensionsDir = `${home}/.gemini/extensions`; await Promise.all(plugins.map((plugin) => uploadDirToSandbox( From c52467839902b1727a3e0a04e512fb43861e67d4 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Fri, 15 May 2026 17:54:54 +1200 Subject: [PATCH 4/5] fix(plugins): fall back to /root when sandbox $HOME degenerates to / Some base images (e.g. node:20-slim) export HOME=/ for the root user. Deriving the plugin install dir straight from $HOME then lands plugins in top-level dot-dirs (/.claude/plugins, /.codex/skills, /.gemini/extensions). Treat a probed $HOME of '/' (or empty) as degenerate and fall back to /root, so plugins install under /root/.{claude,codex,gemini} as expected. Codex now probes $CODEX_HOME and $HOME separately so $CODEX_HOME is still honoured when set. Verified end-to-end against microsandbox node:20-slim VMs. 
--- src/agents/claude.ts | 6 ++++-- src/agents/codex.ts | 10 ++++++++-- src/agents/gemini.ts | 6 ++++-- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/agents/claude.ts b/src/agents/claude.ts index 47d77fc..109ec4d 100644 --- a/src/agents/claude.ts +++ b/src/agents/claude.ts @@ -132,8 +132,10 @@ export class ClaudeAdapter extends BaseAdapter { } })); - const homeResult = await client.runCommand('printf %s "${HOME:-/root}"'); - const home = homeResult.stdout.trim() || '/root'; + // Some base images (e.g. node:20-slim) expand $HOME to '/' for root. + // Fall back to /root so plugins don't land in top-level dot-dirs. + const probedHome = (await client.runCommand('printf %s "${HOME:-/root}"')).stdout.trim(); + const home = probedHome && probedHome !== '/' ? probedHome : '/root'; const pluginsRoot = `${home}/.claude/plugins`; this.installedPluginDirs = await Promise.all(plugins.map(async (plugin) => { diff --git a/src/agents/codex.ts b/src/agents/codex.ts index 0997e72..14dd5e2 100644 --- a/src/agents/codex.ts +++ b/src/agents/codex.ts @@ -164,8 +164,14 @@ export class CodexAdapter extends BaseAdapter { return skills; })); - const homeResult = await client.runCommand('printf %s "${CODEX_HOME:-${HOME:-/root}/.codex}"'); - const codexHome = homeResult.stdout.trim() || '/root/.codex'; + // Respect $CODEX_HOME; otherwise derive from $HOME. Some base images + // (e.g. node:20-slim) expand $HOME to '/' for root — fall back to /root + // so skills don't land in top-level dot-dirs. + const probe = await client.runCommand('printf "%s\\n%s" "${CODEX_HOME:-}" "${HOME:-}"'); + const [codexHomeRaw = '', homeRaw = ''] = probe.stdout.split('\n'); + const probedHome = homeRaw.trim(); + const home = probedHome && probedHome !== '/' ? 
probedHome : '/root'; + const codexHome = codexHomeRaw.trim() || `${home}/.codex`; const codexSkillsDir = `${codexHome}/skills`; const skillOwners = new Map(); diff --git a/src/agents/gemini.ts b/src/agents/gemini.ts index 03d3dcc..217b5d3 100644 --- a/src/agents/gemini.ts +++ b/src/agents/gemini.ts @@ -76,8 +76,10 @@ export class GeminiAdapter extends BaseAdapter { } })); - const homeResult = await client.runCommand('printf %s "${HOME:-/root}"'); - const home = homeResult.stdout.trim() || '/root'; + // Some base images (e.g. node:20-slim) expand $HOME to '/' for root. + // Fall back to /root so extensions don't land in top-level dot-dirs. + const probedHome = (await client.runCommand('printf %s "${HOME:-/root}"')).stdout.trim(); + const home = probedHome && probedHome !== '/' ? probedHome : '/root'; const extensionsDir = `${home}/.gemini/extensions`; await Promise.all(plugins.map((plugin) => From 930d643e6ed8b4c8c370199cf7579c317a7d3563 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Fri, 15 May 2026 18:35:07 +1200 Subject: [PATCH 5/5] =?UTF-8?q?fix(plugins):=20derive=20install=20dir=20pu?= =?UTF-8?q?rely=20from=20$HOME=20=E2=80=94=20required=20for=20auto-discove?= =?UTF-8?q?ry?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts the /root fallback for degenerate $HOME. Verification on a node:20-slim microsandbox VM showed both codex and gemini resolve their home from the $HOME env var (which the image sets to '/' for root), and auto-discover skills/extensions from there — codex created /.codex, gemini reported '/.gemini/settings.json'. The /root fallback installed plugins where those CLIs never look, silently breaking discovery. The correct invariant: for an auto-discovery adapter, the install dir must equal the CLI's $HOME resolution. Since both the install probe and the agent CLI run in the same VM, deriving the install dir purely from $HOME guarantees they match — on any image. 
claude is unaffected either way (it loads plugins via an explicit --plugin-dir flag) but is reverted too so all three adapters stay consistent. --- src/agents/claude.ts | 6 ++---- src/agents/codex.ts | 10 ++-------- src/agents/gemini.ts | 6 ++---- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/src/agents/claude.ts b/src/agents/claude.ts index 109ec4d..47d77fc 100644 --- a/src/agents/claude.ts +++ b/src/agents/claude.ts @@ -132,10 +132,8 @@ export class ClaudeAdapter extends BaseAdapter { } })); - // Some base images (e.g. node:20-slim) expand $HOME to '/' for root. - // Fall back to /root so plugins don't land in top-level dot-dirs. - const probedHome = (await client.runCommand('printf %s "${HOME:-/root}"')).stdout.trim(); - const home = probedHome && probedHome !== '/' ? probedHome : '/root'; + const homeResult = await client.runCommand('printf %s "${HOME:-/root}"'); + const home = homeResult.stdout.trim() || '/root'; const pluginsRoot = `${home}/.claude/plugins`; this.installedPluginDirs = await Promise.all(plugins.map(async (plugin) => { diff --git a/src/agents/codex.ts b/src/agents/codex.ts index 14dd5e2..0997e72 100644 --- a/src/agents/codex.ts +++ b/src/agents/codex.ts @@ -164,14 +164,8 @@ export class CodexAdapter extends BaseAdapter { return skills; })); - // Respect $CODEX_HOME; otherwise derive from $HOME. Some base images - // (e.g. node:20-slim) expand $HOME to '/' for root — fall back to /root - // so skills don't land in top-level dot-dirs. - const probe = await client.runCommand('printf "%s\\n%s" "${CODEX_HOME:-}" "${HOME:-}"'); - const [codexHomeRaw = '', homeRaw = ''] = probe.stdout.split('\n'); - const probedHome = homeRaw.trim(); - const home = probedHome && probedHome !== '/' ? 
probedHome : '/root'; - const codexHome = codexHomeRaw.trim() || `${home}/.codex`; + const homeResult = await client.runCommand('printf %s "${CODEX_HOME:-${HOME:-/root}/.codex}"'); + const codexHome = homeResult.stdout.trim() || '/root/.codex'; const codexSkillsDir = `${codexHome}/skills`; const skillOwners = new Map(); diff --git a/src/agents/gemini.ts b/src/agents/gemini.ts index 217b5d3..03d3dcc 100644 --- a/src/agents/gemini.ts +++ b/src/agents/gemini.ts @@ -76,10 +76,8 @@ export class GeminiAdapter extends BaseAdapter { } })); - // Some base images (e.g. node:20-slim) expand $HOME to '/' for root. - // Fall back to /root so extensions don't land in top-level dot-dirs. - const probedHome = (await client.runCommand('printf %s "${HOME:-/root}"')).stdout.trim(); - const home = probedHome && probedHome !== '/' ? probedHome : '/root'; + const homeResult = await client.runCommand('printf %s "${HOME:-/root}"'); + const home = homeResult.stdout.trim() || '/root'; const extensionsDir = `${home}/.gemini/extensions`; await Promise.all(plugins.map((plugin) =>