diff --git a/README.md b/README.md index eac76d8..982bb7e 100644 --- a/README.md +++ b/README.md @@ -364,6 +364,28 @@ Generator and insights agents run locally and do not require a secret. | `baseUrl` | API base URL. Hostname is used for network allowlisting. Auto-detected for known agents. | | `baseUrlEnvVar` | Override the base URL env var name. Auto-detected for known agents. | +#### Claude Code subscription auth (avoid API billing) + +If you have a Claude Pro / Max / Team / Enterprise subscription, sandbox agents using `command: "claude"` can authenticate via your subscription instead of paying per-token API charges. Point `secret.value` at the Claude Code OAuth token instead of an API key: + +```json +{ + "agents": { + "executor": { "command": "claude", "secret": { "value": "$CLAUDE_CODE_OAUTH_TOKEN" } }, + "judge": { "command": "claude", "secret": { "value": "$CLAUDE_CODE_OAUTH_TOKEN" } } + } +} +``` + +One-time host setup: + +```bash +claude setup-token # interactive — generates a long-lived OAuth token +export CLAUDE_CODE_OAUTH_TOKEN='sk-ant-oat01-…' # paste the token printed by setup-token; export it before running the eval +``` + +How it works: the runtime sniffs the resolved value's prefix at sandbox-create time. Anthropic OAuth tokens start with `sk-ant-oat` (e.g. `sk-ant-oat01-…`); API keys start with `sk-ant-api` (e.g. `sk-ant-api03-…`). Both paths flow through microsandbox's `Secret.env()` TLS substitution — cleartext never enters the VM; the env var inside the sandbox contains a placeholder, and microsandbox swaps it for the real value on outbound TLS to `api.anthropic.com` only. The prefix only decides which env var name (`CLAUDE_CODE_OAUTH_TOKEN` vs `ANTHROPIC_API_KEY`) carries the placeholder. Subscription concurrent-session caps apply. 
+ #### Custom agents Custom agents support additional args fields with `{prompt}` and `{workDir}` placeholders: diff --git a/skills/_reference/config-schema.md b/skills/_reference/config-schema.md index 6e96773..276784f 100644 --- a/skills/_reference/config-schema.md +++ b/skills/_reference/config-schema.md @@ -82,6 +82,12 @@ Extends AgentConfig with one **required** field: |-------|------|----------| | `secret` | `AgentSecretConfig` | **Yes** | +The resolved `secret.value` is wired into the sandbox via microsandbox `Secret.env()` TLS substitution — the cleartext credential never enters the VM. Inside the sandbox the env var contains a `$MSB_` placeholder; microsandbox swaps it for the real value on outbound TLS to the allowed host only. + +By default the placeholder lands under the adapter's API-key env var (e.g. `ANTHROPIC_API_KEY` for claude, see [Known Agent Defaults](#known-agent-defaults-auto-filled-when-field-is-absent) below). + +**Claude-only: subscription auth.** When `command: "claude"` and the resolved value starts with `sk-ant-oat` (a Claude Code subscription OAuth token issued by `claude setup-token`, e.g. `sk-ant-oat01-…`), the placeholder lands under `CLAUDE_CODE_OAUTH_TOKEN` instead. This lets you bill the run against a Pro / Max / Team / Enterprise plan instead of per-token API charges. Point `secret.value` at `"$CLAUDE_CODE_OAUTH_TOKEN"` to opt in. Other adapters (codex, gemini, custom) only have the API-key path today. 
+ ### AgentSecretConfig | Field | Type | Required | diff --git a/src/commands/__tests__/execute.test.ts b/src/commands/__tests__/execute.test.ts index 494ea7c..cd9a570 100644 --- a/src/commands/__tests__/execute.test.ts +++ b/src/commands/__tests__/execute.test.ts @@ -41,7 +41,7 @@ vi.mock('../../sandbox/microsandbox.js', () => { return { MicrosandboxClient: MockMicrosandboxClient, buildSecrets: vi.fn().mockReturnValue([]), - buildAgentSecret: vi.fn().mockReturnValue({}), + applyAgentAuth: vi.fn(), resolveEnv: vi.fn().mockReturnValue({}), }; }); diff --git a/src/commands/execute.ts b/src/commands/execute.ts index af24ee0..58654ee 100644 --- a/src/commands/execute.ts +++ b/src/commands/execute.ts @@ -3,7 +3,7 @@ import ora from 'ora'; import { loadDotenv } from '../core/env.js'; import { loadConfig } from '../core/config.js'; import { loadTestSuite, saveResult, saveBinaryResult, formatElapsed } from '../core/suite-io.js'; -import { MicrosandboxClient, buildSecrets, buildAgentSecret, resolveEnv, type CommandResult } from '../sandbox/microsandbox.js'; +import { MicrosandboxClient, buildSecrets, applyAgentAuth, resolveEnv, type CommandResult } from '../sandbox/microsandbox.js'; import { createEgressLogger } from '../sandbox/egress-logger.js'; import { scaffoldWorkspace } from '../sandbox/scaffolding.js'; import { WorkerPool } from '../sandbox/worker-pool.js'; @@ -158,15 +158,10 @@ export async function executeTestCase( const env = resolveEnv(config.sandbox?.env); const timeoutSecs = target.timeout ?? config.sandbox.defaultTimeout ?? 600; - // Merge agent secret into sandbox secrets and set base URL env var const executorConfig: SandboxAgentConfig = config.agents?.executor ?? { command: 'claude', secret: { value: '$ANTHROPIC_API_KEY' } }; const execAdapter = createAdapter(executorConfig); - secrets.push(buildAgentSecret(executorConfig.secret, execAdapter.additionalAllowHosts)); - const baseUrlVar = executorConfig.secret.baseUrlEnvVar ?? 
execAdapter.baseUrlEnvVar; - if (baseUrlVar && executorConfig.secret.baseUrl) { - env[baseUrlVar] = executorConfig.secret.baseUrl; - } + applyAgentAuth(executorConfig.secret, execAdapter, secrets, env); await client.create( sandboxName(testCase.id), diff --git a/src/commands/sandbox.ts b/src/commands/sandbox.ts index 7bb2e05..8645cf1 100644 --- a/src/commands/sandbox.ts +++ b/src/commands/sandbox.ts @@ -5,7 +5,7 @@ import { loadDotenv } from '../core/env.js'; import { loadConfig } from '../core/config.js'; import { loadTestSuite, loadBinaryResult } from '../core/suite-io.js'; import { loadJsonFile } from '../core/results.js'; -import { MicrosandboxClient, buildSecrets, buildAgentSecret, resolveEnv } from '../sandbox/microsandbox.js'; +import { MicrosandboxClient, buildSecrets, applyAgentAuth, resolveEnv } from '../sandbox/microsandbox.js'; import { scaffoldWorkspace, uploadSources } from '../sandbox/scaffolding.js'; import { createEgressLogger } from '../sandbox/egress-logger.js'; import { createAdapter } from '../agents/adapter.js'; @@ -59,11 +59,7 @@ export async function sandboxCommand(paths: ProjectPaths, options: SandboxOption if (options.mode) { agentConfig = getAgentConfig(config, options.mode); adapter = createAdapter(agentConfig); - secrets.push(buildAgentSecret(agentConfig.secret, adapter.additionalAllowHosts)); - const baseUrlVar = agentConfig.secret.baseUrlEnvVar ?? 
adapter.baseUrlEnvVar; - if (baseUrlVar && agentConfig.secret.baseUrl) { - env[baseUrlVar] = agentConfig.secret.baseUrl; - } + applyAgentAuth(agentConfig.secret, adapter, secrets, env); } // Prepare output directory for artifacts diff --git a/src/core/__tests__/config.test.ts b/src/core/__tests__/config.test.ts index 61fe7da..b8bf405 100644 --- a/src/core/__tests__/config.test.ts +++ b/src/core/__tests__/config.test.ts @@ -223,4 +223,14 @@ describe('loadConfig', () => { mockReadFile.mockResolvedValue(JSON.stringify(config)); await expect(loadConfig('/fake/config.json')).rejects.toThrow(/valid URL/); }); + + it('accepts secret pointing at $CLAUDE_CODE_OAUTH_TOKEN (auth mode resolved later by value prefix)', async () => { + const config = { + ...validConfig, + agents: { judge: { command: 'claude', secret: { value: '$CLAUDE_CODE_OAUTH_TOKEN' } } }, + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + const result = await loadConfig('/fake/config.json'); + expect(result.agents?.judge?.secret?.value).toBe('$CLAUDE_CODE_OAUTH_TOKEN'); + }); }); diff --git a/src/core/config.ts b/src/core/config.ts index 6f2df48..a1d618b 100644 --- a/src/core/config.ts +++ b/src/core/config.ts @@ -129,7 +129,9 @@ export function validateConfig(data: unknown, configPath?: string): Config { const isSandboxRole = SANDBOX_ROLES.includes(role); if (isSandboxRole) { - // Sandbox agents (executor/judge) require secret + // Sandbox agents (executor/judge) require secret. Auth mode (API key vs Claude Code + // subscription OAuth token) is auto-detected from the resolved value's prefix at + // sandbox-create time. 
if (!agent.secret || typeof agent.secret !== 'object' || Array.isArray(agent.secret)) { throw new Error(`agents.${role} requires a secret with at least { value } for secure sandbox execution`); } diff --git a/src/sandbox/__tests__/microsandbox.test.ts b/src/sandbox/__tests__/microsandbox.test.ts index a927d73..fe823b4 100644 --- a/src/sandbox/__tests__/microsandbox.test.ts +++ b/src/sandbox/__tests__/microsandbox.test.ts @@ -1,5 +1,5 @@ -import { describe, it, expect, vi, beforeEach } from 'vitest'; -import { MicrosandboxClient, buildSecrets, resolveEnv } from '../microsandbox.js'; +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { MicrosandboxClient, buildSecrets, resolveEnv, applyAgentAuth } from '../microsandbox.js'; // ── Mocks ──────────────────────────────────────────────────────────────────── @@ -328,4 +328,64 @@ describe('MicrosandboxClient', () => { await expect(client.destroy()).resolves.toBeUndefined(); }); }); +}); + +describe('applyAgentAuth', () => { + const ORIGINAL_API_KEY = process.env.ANTHROPIC_API_KEY; + const ORIGINAL_OAUTH = process.env.CLAUDE_CODE_OAUTH_TOKEN; + + beforeEach(() => { + vi.clearAllMocks(); + }); + + afterEach(() => { + const restore = (key: string, value: string | undefined) => { + if (value === undefined) delete process.env[key]; + else process.env[key] = value; + }; + restore('ANTHROPIC_API_KEY', ORIGINAL_API_KEY); + restore('CLAUDE_CODE_OAUTH_TOKEN', ORIGINAL_OAUTH); + }); + + const claudeAdapter = { + baseUrlEnvVar: 'ANTHROPIC_BASE_URL', + additionalAllowHosts: [], + }; + + it('routes an OAuth-prefixed value through Secret.env under CLAUDE_CODE_OAUTH_TOKEN', async () => { + const { Secret } = await import('microsandbox'); + process.env.CLAUDE_CODE_OAUTH_TOKEN = 'sk-ant-oat01-fake-test-token'; + applyAgentAuth({ + envVar: 'CLAUDE_CODE_OAUTH_TOKEN', + value: '$CLAUDE_CODE_OAUTH_TOKEN', + baseUrl: 'https://api.anthropic.com', + }, claudeAdapter, [], {}); + 
expect(Secret.env).toHaveBeenCalledWith('CLAUDE_CODE_OAUTH_TOKEN', expect.objectContaining({ + value: 'sk-ant-oat01-fake-test-token', + allowHosts: ['api.anthropic.com'], + })); + }); + + it('routes an API-key value through Secret.env under the agent-specific env var', async () => { + const { Secret } = await import('microsandbox'); + process.env.ANTHROPIC_API_KEY = 'sk-ant-api03-fake-test-key'; + const env: Record = {}; + applyAgentAuth({ + envVar: 'ANTHROPIC_API_KEY', + value: '$ANTHROPIC_API_KEY', + baseUrl: 'https://api.anthropic.com', + baseUrlEnvVar: 'ANTHROPIC_BASE_URL', + }, claudeAdapter, [], env); + expect(Secret.env).toHaveBeenCalledWith('ANTHROPIC_API_KEY', expect.objectContaining({ + value: 'sk-ant-api03-fake-test-key', + allowHosts: ['api.anthropic.com'], + })); + expect(env.ANTHROPIC_BASE_URL).toBe('https://api.anthropic.com'); + }); + + it('throws when envVar or baseUrl is missing', () => { + expect(() => applyAgentAuth({ + value: 'literal-value', + } as never, claudeAdapter, [], {})).toThrow(/envVar and baseUrl/); + }); }); \ No newline at end of file diff --git a/src/sandbox/microsandbox.ts b/src/sandbox/microsandbox.ts index daa23bf..b8e6434 100644 --- a/src/sandbox/microsandbox.ts +++ b/src/sandbox/microsandbox.ts @@ -5,6 +5,7 @@ import type { FsEntry, } from 'microsandbox'; import type { SandboxConfig, SecretConfig, AgentSecretConfig } from '../types.js'; +import type { AgentAdapter } from '../agents/adapter.js'; export interface CommandResult { stdout: string; @@ -49,18 +50,54 @@ export function resolveEnv( return resolved; } +// Claude-specific credential format. Subscription OAuth tokens are prefixed +// `sk-ant-oat` followed by a version (e.g. `sk-ant-oat01-…`), issued by +// `claude setup-token`. API keys use `sk-ant-api`. The framework picks the +// env-var slot the placeholder lands under by inspecting the resolved +// value's prefix — no separate config flag needed. 
+const OAUTH_TOKEN_PREFIX = 'sk-ant-oat'; +const OAUTH_TOKEN_ENV_VAR = 'CLAUDE_CODE_OAUTH_TOKEN'; + /** - * Build a microsandbox `Secret.env()` entry from an agent's secret config. - * The `allowHosts` is derived from the base URL hostname. + * Wire an agent's secret into the sandbox `secrets` and `env`. + * + * Both auth modes (API key and Claude Code subscription OAuth) go through + * microsandbox `Secret.env()` TLS substitution — the cleartext value never + * enters the VM. Inside the sandbox the env var contains the + * `$MSB_` placeholder; microsandbox swaps it for the real value + * on outbound TLS to the allowed host only. + * + * The resolved value's prefix picks which env var name carries the placeholder: + * - `sk-ant-oat…` (Claude Code subscription OAuth, issued by `claude setup-token`) + * → `CLAUDE_CODE_OAUTH_TOKEN` + * - anything else (API keys for known agents, custom-agent secrets) + * → `secret.envVar` (= `ANTHROPIC_API_KEY` for claude, etc.) + * + * Mutates `secrets` and `env` in place. */ -export function buildAgentSecret(secret: AgentSecretConfig, additionalAllowHosts?: string[]): SecretEntry { +export function applyAgentAuth( + secret: AgentSecretConfig, + adapter: Pick, + secrets: SecretEntry[], + env: Record, +): void { if (!secret.envVar || !secret.baseUrl) { throw new Error('Agent secret must have envVar and baseUrl set (should be filled by config validation)'); } const value = resolveValue(secret.value, secret.envVar); + + const envVar = value.startsWith(OAUTH_TOKEN_PREFIX) + ? OAUTH_TOKEN_ENV_VAR + : secret.envVar; + const hostname = new URL(secret.baseUrl).hostname; - const allowHosts = [hostname, ...(additionalAllowHosts ?? [])]; - return Secret.env(secret.envVar, { value, allowHosts }); + const allowHosts = [hostname, ...adapter.additionalAllowHosts]; + secrets.push(Secret.env(envVar, { value, allowHosts })); + + const baseUrlVar = secret.baseUrlEnvVar ?? 
adapter.baseUrlEnvVar; + if (baseUrlVar) { + env[baseUrlVar] = secret.baseUrl; + } } function resolveValue(value: string, envVar: string): string { diff --git a/src/scoring/__tests__/judge.test.ts b/src/scoring/__tests__/judge.test.ts index 87c5b1e..c8952c6 100644 --- a/src/scoring/__tests__/judge.test.ts +++ b/src/scoring/__tests__/judge.test.ts @@ -23,7 +23,7 @@ vi.mock('../../sandbox/microsandbox.js', () => ({ Object.assign(this, mockClient); }), buildSecrets: vi.fn().mockReturnValue([]), - buildAgentSecret: vi.fn().mockReturnValue({}), + applyAgentAuth: vi.fn(), resolveEnv: vi.fn().mockReturnValue({}), })); diff --git a/src/scoring/judge.ts b/src/scoring/judge.ts index ce6af0b..fe2c42d 100644 --- a/src/scoring/judge.ts +++ b/src/scoring/judge.ts @@ -1,7 +1,7 @@ import type { SolutionFile, JudgeScore, TestCase, SandboxAgentConfig, TargetConfig, Config, ProjectPaths, SourceConfig } from '../types.js'; import { createAdapter } from '../agents/adapter.js'; import { JUDGE_SCORING_CRITERIA, extractJson } from '../commands/prompt-helpers.js'; -import { MicrosandboxClient, buildSecrets, buildAgentSecret, resolveEnv } from '../sandbox/microsandbox.js'; +import { MicrosandboxClient, buildSecrets, applyAgentAuth, resolveEnv } from '../sandbox/microsandbox.js'; import { createEgressLockdownLogger } from '../sandbox/egress-logger.js'; import { scaffoldWorkspace, uploadSources } from '../sandbox/scaffolding.js'; import { deduplicateSources } from '../core/source-resolver.js'; @@ -135,7 +135,7 @@ const INFRA_ALLOWLIST = [ export function buildJudgeAllowlist(judgeConfig: SandboxAgentConfig, config: Config): string[] { const hosts = new Set(); - // 1. Agent API endpoint from secret's baseUrl + // 1. 
Agent API endpoint from secret.baseUrl if (judgeConfig.secret.baseUrl) { try { hosts.add(new URL(judgeConfig.secret.baseUrl).hostname); } catch { /* skip malformed */ } } @@ -286,13 +286,8 @@ export async function runSandboxedJudge( const env = resolveEnv(config.sandbox?.env); const timeoutSecs = target.timeout ?? config.sandbox.defaultTimeout ?? 600; - // Merge agent secret into sandbox secrets and set base URL env var const judgeAdapter = createAdapter(judgeConfig); - secrets.push(buildAgentSecret(judgeConfig.secret, judgeAdapter.additionalAllowHosts)); - const baseUrlVar = judgeConfig.secret.baseUrlEnvVar ?? judgeAdapter.baseUrlEnvVar; - if (baseUrlVar && judgeConfig.secret.baseUrl) { - env[baseUrlVar] = judgeConfig.secret.baseUrl; - } + applyAgentAuth(judgeConfig.secret, judgeAdapter, secrets, env); await client.create( sandboxName(testCase.id), diff --git a/src/types.ts b/src/types.ts index c3dae8f..1621ace 100644 --- a/src/types.ts +++ b/src/types.ts @@ -109,9 +109,28 @@ export interface AgentConfig { logPattern?: string; } -/** Agent config for sandboxed execution (executor/judge). Secret is required for microsandbox TLS injection. */ +/** Agent config for sandboxed execution (executor/judge). + * + * Both auth modes flow through microsandbox `Secret.env()` TLS substitution — + * the cleartext credential never enters the VM. Inside the sandbox the env + * var contains a `$MSB_` placeholder; microsandbox swaps it for the + * real value on outbound TLS to the allowed host only. + * + * The resolved `secret.value`'s prefix picks which env var name carries the + * placeholder: + * + * - `sk-ant-oat…` (Claude Code subscription OAuth token, issued by + * `claude setup-token`, requires Pro / Max / Team / Enterprise) → + * `CLAUDE_CODE_OAUTH_TOKEN`. Avoids per-token API billing. + * - anything else (API keys for known agents, custom-agent secrets) → + * `secret.envVar` (= `ANTHROPIC_API_KEY` for claude, etc.). 
+ * + * Point `secret.value` at the host env var that holds the credential — + * `$ANTHROPIC_API_KEY` for the API-key path, `$CLAUDE_CODE_OAUTH_TOKEN` for + * the subscription path. + */ export interface SandboxAgentConfig extends AgentConfig { - /** Agent's API secret and base URL. Flows to microsandbox TLS injection, sandbox env, and judge lockdown allowlist. */ + /** Agent's secret and base URL. Auth mode is determined from the resolved value's prefix. */ secret: AgentSecretConfig; }