From de61e0406f981ba5c5f3f6354731e8d85f713727 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Wed, 13 May 2026 15:41:23 +1200 Subject: [PATCH 1/6] feat(auth): add useOAuth path so sandboxed Claude agents use a subscription token MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a first-class subscription-auth option for executor/judge sandbox roles: "agents": { "executor": { "command": "claude", "useOAuth": true }, "judge": { "command": "claude", "useOAuth": true } } Why this exists: per-token API billing for a full A/B sweep against a real SDK costs ~$135-$270 (Opus 4.7); we just stopped a run partway in at ~$30 sunk. Claude Code on a Pro/Max/Team/Enterprise plan can authenticate via a long-lived subscription token instead — flat-rate billing tied to the plan. How it works: the framework's existing `secret` path uses microsandbox TLS-injection — the cleartext value never enters the VM, only a placeholder substituted on the wire for the allowed host. That model is fundamentally incompatible with `CLAUDE_CODE_OAUTH_TOKEN` because Claude reads the token directly from `process.env`. So `useOAuth: true`: - Resolves CLAUDE_CODE_OAUTH_TOKEN from the host environment (fail-fast with a setup-token hint if unset) - Injects it into the sandbox as a plain env var via `sandbox.env` - Skips `buildAgentSecret` for that role (no API key in env at all, so Claude's auth precedence falls through cleanly to OAuth) - Sets ANTHROPIC_BASE_URL from the adapter default - For judge: contributes the adapter default hostname to the network lockdown allowlist Validation: exactly one of `secret` or `useOAuth: true` must be set per sandbox role. `useOAuth: true` requires `command: "claude"`. Setting both is rejected. Adapter-side enforcement keeps the auth surface intentionally narrow. 
User flow (one-time host setup): claude setup-token # interactive, ~1 yr token export CLAUDE_CODE_OAUTH_TOKEN='<token>' # then run the eval Tests: 354 pass (added 6 — 4 config validation + 2 resolveOAuthToken). README + config-schema reference document the new path. Type-check clean. --- README.md | 26 ++++++++ skills/_reference/config-schema.md | 10 ++- src/commands/execute.ts | 21 +++++-- src/commands/sandbox.ts | 17 ++++-- src/core/__tests__/config.test.ts | 48 +++++++++++++-- src/core/config.ts | 71 ++++++++++++++-------- src/sandbox/__tests__/microsandbox.test.ts | 26 +++++++- src/sandbox/microsandbox.ts | 23 +++++++ src/scoring/judge.ts | 28 ++++++--- src/types.ts | 20 +++++- 10 files changed, 235 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index eac76d8..26b1838 100644 --- a/README.md +++ b/README.md @@ -364,6 +364,32 @@ Generator and insights agents run locally and do not require a secret. | `baseUrl` | API base URL. Hostname is used for network allowlisting. Auto-detected for known agents. | | `baseUrlEnvVar` | Override the base URL env var name. Auto-detected for known agents. | +#### Claude Code subscription auth (avoid API billing) + +If you have a Claude Pro / Max / Team / Enterprise subscription, sandbox agents using `command: "claude"` can authenticate via your subscription instead of paying per-token API charges. Set `useOAuth: true` instead of providing a `secret`: + +```json +{ + "agents": { + "executor": { "command": "claude", "useOAuth": true }, + "judge": { "command": "claude", "useOAuth": true } + } +} +``` + +One-time host setup: + +```bash +claude setup-token # interactive — generates a long-lived OAuth token +export CLAUDE_CODE_OAUTH_TOKEN='' # before running the eval +``` + +Notes: +- Token is injected into the sandbox as a plain env var (Claude reads it directly from `process.env`; the API-key TLS-substitution model does not apply). +- Only valid for `command: "claude"`. 
The framework rejects `useOAuth: true` on other adapters at config-load time. +- Setting both `secret` and `useOAuth` is rejected — choose one path per role. +- Subscription concurrent-session caps apply. + #### Custom agents Custom agents support additional args fields with `{prompt}` and `{workDir}` placeholders: diff --git a/skills/_reference/config-schema.md b/skills/_reference/config-schema.md index 6e96773..c1bb0a5 100644 --- a/skills/_reference/config-schema.md +++ b/skills/_reference/config-schema.md @@ -76,11 +76,16 @@ ### SandboxAgentConfig (executor, judge) -Extends AgentConfig with one **required** field: +Extends AgentConfig with auth fields. **Exactly one of `secret` or `useOAuth` is required.** | Field | Type | Required | |-------|------|----------| -| `secret` | `AgentSecretConfig` | **Yes** | +| `secret` | `AgentSecretConfig` | One-of — API key via TLS injection | +| `useOAuth` | `boolean` | One-of — Claude Code subscription via `CLAUDE_CODE_OAUTH_TOKEN`. Only valid when `command: "claude"`. | + +**API-key path (`secret`)** — microsandbox TLS-injects the value, so the cleartext never enters the VM. Inside the sandbox the env var contains only a placeholder substituted on the wire for the allowed host. + +**Subscription path (`useOAuth: true`)** — reads `CLAUDE_CODE_OAUTH_TOKEN` from the host environment and injects it into the sandbox as a plain env var. Claude reads the token directly. Generate the token once with `claude setup-token` (Pro / Max / Team / Enterprise required), export it, then `useOAuth: true` on both executor and judge. Avoids per-token API billing. ### AgentSecretConfig @@ -147,6 +152,7 @@ Custom agents (any command not in the table above) **must** provide `envVar` and 7. `agents.executor` and `agents.judge` must have `secret.value` (non-empty string) 8. Custom agents must provide `envVar` and `baseUrl` in their secret 9. `baseUrl` must be a parseable URL +10. 
Each sandbox agent role (`executor`, `judge`) must declare auth: either `secret` (API-key path) or `useOAuth: true` (Claude Code subscription path). Setting both is rejected. `useOAuth: true` requires `command: "claude"`. ## Minimal Examples diff --git a/src/commands/execute.ts b/src/commands/execute.ts index af24ee0..77f43a7 100644 --- a/src/commands/execute.ts +++ b/src/commands/execute.ts @@ -3,7 +3,7 @@ import ora from 'ora'; import { loadDotenv } from '../core/env.js'; import { loadConfig } from '../core/config.js'; import { loadTestSuite, saveResult, saveBinaryResult, formatElapsed } from '../core/suite-io.js'; -import { MicrosandboxClient, buildSecrets, buildAgentSecret, resolveEnv, type CommandResult } from '../sandbox/microsandbox.js'; +import { MicrosandboxClient, buildSecrets, buildAgentSecret, resolveEnv, resolveOAuthToken, type CommandResult } from '../sandbox/microsandbox.js'; import { createEgressLogger } from '../sandbox/egress-logger.js'; import { scaffoldWorkspace } from '../sandbox/scaffolding.js'; import { WorkerPool } from '../sandbox/worker-pool.js'; @@ -158,14 +158,23 @@ export async function executeTestCase( const env = resolveEnv(config.sandbox?.env); const timeoutSecs = target.timeout ?? config.sandbox.defaultTimeout ?? 600; - // Merge agent secret into sandbox secrets and set base URL env var + // Resolve agent auth. Two paths: + // - secret → microsandbox TLS-injected placeholder for an API key + // - useOAuth → plain CLAUDE_CODE_OAUTH_TOKEN env var (claude reads it directly) const executorConfig: SandboxAgentConfig = config.agents?.executor ?? { command: 'claude', secret: { value: '$ANTHROPIC_API_KEY' } }; const execAdapter = createAdapter(executorConfig); - secrets.push(buildAgentSecret(executorConfig.secret, execAdapter.additionalAllowHosts)); - const baseUrlVar = executorConfig.secret.baseUrlEnvVar ?? 
execAdapter.baseUrlEnvVar; - if (baseUrlVar && executorConfig.secret.baseUrl) { - env[baseUrlVar] = executorConfig.secret.baseUrl; + if (executorConfig.useOAuth) { + env.CLAUDE_CODE_OAUTH_TOKEN = resolveOAuthToken(); + if (execAdapter.baseUrlEnvVar && execAdapter.defaultBaseUrl) { + env[execAdapter.baseUrlEnvVar] = execAdapter.defaultBaseUrl; + } + } else if (executorConfig.secret) { + secrets.push(buildAgentSecret(executorConfig.secret, execAdapter.additionalAllowHosts)); + const baseUrlVar = executorConfig.secret.baseUrlEnvVar ?? execAdapter.baseUrlEnvVar; + if (baseUrlVar && executorConfig.secret.baseUrl) { + env[baseUrlVar] = executorConfig.secret.baseUrl; + } } await client.create( diff --git a/src/commands/sandbox.ts b/src/commands/sandbox.ts index 7bb2e05..2ca5c45 100644 --- a/src/commands/sandbox.ts +++ b/src/commands/sandbox.ts @@ -5,7 +5,7 @@ import { loadDotenv } from '../core/env.js'; import { loadConfig } from '../core/config.js'; import { loadTestSuite, loadBinaryResult } from '../core/suite-io.js'; import { loadJsonFile } from '../core/results.js'; -import { MicrosandboxClient, buildSecrets, buildAgentSecret, resolveEnv } from '../sandbox/microsandbox.js'; +import { MicrosandboxClient, buildSecrets, buildAgentSecret, resolveEnv, resolveOAuthToken } from '../sandbox/microsandbox.js'; import { scaffoldWorkspace, uploadSources } from '../sandbox/scaffolding.js'; import { createEgressLogger } from '../sandbox/egress-logger.js'; import { createAdapter } from '../agents/adapter.js'; @@ -59,10 +59,17 @@ export async function sandboxCommand(paths: ProjectPaths, options: SandboxOption if (options.mode) { agentConfig = getAgentConfig(config, options.mode); adapter = createAdapter(agentConfig); - secrets.push(buildAgentSecret(agentConfig.secret, adapter.additionalAllowHosts)); - const baseUrlVar = agentConfig.secret.baseUrlEnvVar ?? 
adapter.baseUrlEnvVar; - if (baseUrlVar && agentConfig.secret.baseUrl) { - env[baseUrlVar] = agentConfig.secret.baseUrl; + if (agentConfig.useOAuth) { + env.CLAUDE_CODE_OAUTH_TOKEN = resolveOAuthToken(); + if (adapter.baseUrlEnvVar && adapter.defaultBaseUrl) { + env[adapter.baseUrlEnvVar] = adapter.defaultBaseUrl; + } + } else if (agentConfig.secret) { + secrets.push(buildAgentSecret(agentConfig.secret, adapter.additionalAllowHosts)); + const baseUrlVar = agentConfig.secret.baseUrlEnvVar ?? adapter.baseUrlEnvVar; + if (baseUrlVar && agentConfig.secret.baseUrl) { + env[baseUrlVar] = agentConfig.secret.baseUrl; + } } } diff --git a/src/core/__tests__/config.test.ts b/src/core/__tests__/config.test.ts index 61fe7da..7b08938 100644 --- a/src/core/__tests__/config.test.ts +++ b/src/core/__tests__/config.test.ts @@ -149,9 +149,9 @@ describe('loadConfig', () => { mockReadFile.mockResolvedValue(JSON.stringify(config)); const result = await loadConfig('/fake/config.json'); // Defaults should be filled in - expect(result.agents?.judge?.secret.envVar).toBe('ANTHROPIC_API_KEY'); - expect(result.agents?.judge?.secret.baseUrl).toBe('https://api.anthropic.com'); - expect(result.agents?.judge?.secret.baseUrlEnvVar).toBe('ANTHROPIC_BASE_URL'); + expect(result.agents?.judge?.secret?.envVar).toBe('ANTHROPIC_API_KEY'); + expect(result.agents?.judge?.secret?.baseUrl).toBe('https://api.anthropic.com'); + expect(result.agents?.judge?.secret?.baseUrlEnvVar).toBe('ANTHROPIC_BASE_URL'); }); it('accepts known agent with all secret fields explicit', async () => { @@ -166,7 +166,7 @@ describe('loadConfig', () => { }; mockReadFile.mockResolvedValue(JSON.stringify(config)); const result = await loadConfig('/fake/config.json'); - expect(result.agents?.judge?.secret.envVar).toBe('ANTHROPIC_API_KEY'); + expect(result.agents?.judge?.secret?.envVar).toBe('ANTHROPIC_API_KEY'); }); it('throws when sandbox agent (executor) is missing secret', async () => { @@ -223,4 +223,44 @@ describe('loadConfig', () 
=> { mockReadFile.mockResolvedValue(JSON.stringify(config)); await expect(loadConfig('/fake/config.json')).rejects.toThrow(/valid URL/); }); + + describe('useOAuth (Claude Code subscription auth)', () => { + it('accepts judge with useOAuth: true and no secret', async () => { + const config = { + ...validConfig, + agents: { judge: { command: 'claude', useOAuth: true } }, + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + const result = await loadConfig('/fake/config.json'); + expect(result.agents?.judge?.useOAuth).toBe(true); + expect(result.agents?.judge?.secret).toBeUndefined(); + }); + + it('rejects useOAuth with command != claude', async () => { + const config = { + ...validConfig, + agents: { judge: { command: 'codex', useOAuth: true } }, + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/useOAuth.*only supported for command: "claude"/); + }); + + it('rejects sandbox role with neither secret nor useOAuth', async () => { + const config = { + ...validConfig, + agents: { judge: { command: 'claude' } }, + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/secret.*or.*useOAuth/); + }); + + it('rejects setting both secret and useOAuth on the same role', async () => { + const config = { + ...validConfig, + agents: { judge: { command: 'claude', useOAuth: true, secret: { value: '$ANTHROPIC_API_KEY' } } }, + }; + mockReadFile.mockResolvedValue(JSON.stringify(config)); + await expect(loadConfig('/fake/config.json')).rejects.toThrow(/cannot set both useOAuth and secret/); + }); + }); }); diff --git a/src/core/config.ts b/src/core/config.ts index 6f2df48..47408c4 100644 --- a/src/core/config.ts +++ b/src/core/config.ts @@ -129,36 +129,55 @@ export function validateConfig(data: unknown, configPath?: string): Config { const isSandboxRole = SANDBOX_ROLES.includes(role); if (isSandboxRole) { - // Sandbox agents 
(executor/judge) require secret - if (!agent.secret || typeof agent.secret !== 'object' || Array.isArray(agent.secret)) { - throw new Error(`agents.${role} requires a secret with at least { value } for secure sandbox execution`); - } - const secret = agent.secret as Record; - if (!secret.value || typeof secret.value !== 'string') { - throw new Error(`agents.${role}.secret.value must be a non-empty string`); - } - - // Fill defaults from adapter for known agents - const adapter = createAdapter({ command } as AgentConfig); - if (adapter.defaultEnvVar) { - if (!secret.envVar) secret.envVar = adapter.defaultEnvVar; - if (!secret.baseUrl) secret.baseUrl = adapter.defaultBaseUrl; - if (!secret.baseUrlEnvVar) secret.baseUrlEnvVar = adapter.baseUrlEnvVar; + const useOAuth = agent.useOAuth === true; + + if (useOAuth) { + // OAuth path: Claude Code subscription via CLAUDE_CODE_OAUTH_TOKEN. + if (command !== 'claude') { + throw new Error( + `agents.${role}.useOAuth: true is only supported for command: "claude" (Claude Code subscription auth). ` + + `Got command: "${command ?? '(unset)'}".`, + ); + } + if (agent.secret !== undefined) { + throw new Error( + `agents.${role} cannot set both useOAuth and secret — choose one auth path.`, + ); + } } else { - // Custom agents must specify envVar and baseUrl - if (!secret.envVar || typeof secret.envVar !== 'string') { - throw new Error(`agents.${role}.secret.envVar is required for custom agent '${command}'`); + // API-key path: secret with TLS-injected value. 
+ if (!agent.secret || typeof agent.secret !== 'object' || Array.isArray(agent.secret)) { + throw new Error( + `agents.${role} requires either { secret: {...} } or { useOAuth: true } for sandbox auth`, + ); } - if (!secret.baseUrl || typeof secret.baseUrl !== 'string') { - throw new Error(`agents.${role}.secret.baseUrl is required for custom agent '${command}'`); + const secret = agent.secret as Record; + if (!secret.value || typeof secret.value !== 'string') { + throw new Error(`agents.${role}.secret.value must be a non-empty string`); + } + + // Fill defaults from adapter for known agents + const adapter = createAdapter({ command } as AgentConfig); + if (adapter.defaultEnvVar) { + if (!secret.envVar) secret.envVar = adapter.defaultEnvVar; + if (!secret.baseUrl) secret.baseUrl = adapter.defaultBaseUrl; + if (!secret.baseUrlEnvVar) secret.baseUrlEnvVar = adapter.baseUrlEnvVar; + } else { + // Custom agents must specify envVar and baseUrl + if (!secret.envVar || typeof secret.envVar !== 'string') { + throw new Error(`agents.${role}.secret.envVar is required for custom agent '${command}'`); + } + if (!secret.baseUrl || typeof secret.baseUrl !== 'string') { + throw new Error(`agents.${role}.secret.baseUrl is required for custom agent '${command}'`); + } } - } - // Validate baseUrl is a valid URL - try { - new URL(secret.baseUrl as string); - } catch { - throw new Error(`agents.${role}.secret.baseUrl must be a valid URL`); + // Validate baseUrl is a valid URL + try { + new URL(secret.baseUrl as string); + } catch { + throw new Error(`agents.${role}.secret.baseUrl must be a valid URL`); + } } } } diff --git a/src/sandbox/__tests__/microsandbox.test.ts b/src/sandbox/__tests__/microsandbox.test.ts index a927d73..31f0736 100644 --- a/src/sandbox/__tests__/microsandbox.test.ts +++ b/src/sandbox/__tests__/microsandbox.test.ts @@ -1,5 +1,5 @@ -import { describe, it, expect, vi, beforeEach } from 'vitest'; -import { MicrosandboxClient, buildSecrets, resolveEnv } from 
'../microsandbox.js'; +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { MicrosandboxClient, buildSecrets, resolveEnv, resolveOAuthToken } from '../microsandbox.js'; // ── Mocks ──────────────────────────────────────────────────────────────────── @@ -328,4 +328,26 @@ describe('MicrosandboxClient', () => { await expect(client.destroy()).resolves.toBeUndefined(); }); }); +}); + +describe('resolveOAuthToken', () => { + const ORIGINAL_TOKEN = process.env.CLAUDE_CODE_OAUTH_TOKEN; + + afterEach(() => { + if (ORIGINAL_TOKEN === undefined) { + delete process.env.CLAUDE_CODE_OAUTH_TOKEN; + } else { + process.env.CLAUDE_CODE_OAUTH_TOKEN = ORIGINAL_TOKEN; + } + }); + + it('returns the token when CLAUDE_CODE_OAUTH_TOKEN is set on the host', () => { + process.env.CLAUDE_CODE_OAUTH_TOKEN = 'sk-ant-oat-test-value'; + expect(resolveOAuthToken()).toBe('sk-ant-oat-test-value'); + }); + + it('throws with a clear setup-token hint when CLAUDE_CODE_OAUTH_TOKEN is unset', () => { + delete process.env.CLAUDE_CODE_OAUTH_TOKEN; + expect(() => resolveOAuthToken()).toThrow(/claude setup-token/); + }); }); \ No newline at end of file diff --git a/src/sandbox/microsandbox.ts b/src/sandbox/microsandbox.ts index daa23bf..73f3a2e 100644 --- a/src/sandbox/microsandbox.ts +++ b/src/sandbox/microsandbox.ts @@ -63,6 +63,29 @@ export function buildAgentSecret(secret: AgentSecretConfig, additionalAllowHosts return Secret.env(secret.envVar, { value, allowHosts }); } +/** + * Resolve the Claude Code OAuth token from the host environment for + * `useOAuth: true` agent configs. Unlike API keys (TLS-injected as + * placeholders by microsandbox), OAuth tokens must enter the VM as the real + * value because Claude reads them directly from `process.env`. The caller + * places the returned value under `sandbox.env.CLAUDE_CODE_OAUTH_TOKEN`. 
+ * + * Throws with a clear message if `CLAUDE_CODE_OAUTH_TOKEN` is not set on + * the host — fail-fast so the user knows to run `claude setup-token` and + * export the result before the eval starts. + */ +export function resolveOAuthToken(): string { + const value = process.env.CLAUDE_CODE_OAUTH_TOKEN; + if (!value) { + throw new Error( + "CLAUDE_CODE_OAUTH_TOKEN is not set on the host. " + + "Generate a long-lived subscription token with `claude setup-token` " + + "and `export CLAUDE_CODE_OAUTH_TOKEN=` before running the eval.", + ); + } + return value; +} + function resolveValue(value: string, envVar: string): string { if (value.startsWith('$')) { const hostVar = value.slice(1); diff --git a/src/scoring/judge.ts b/src/scoring/judge.ts index ce6af0b..dcf553d 100644 --- a/src/scoring/judge.ts +++ b/src/scoring/judge.ts @@ -1,7 +1,7 @@ import type { SolutionFile, JudgeScore, TestCase, SandboxAgentConfig, TargetConfig, Config, ProjectPaths, SourceConfig } from '../types.js'; import { createAdapter } from '../agents/adapter.js'; import { JUDGE_SCORING_CRITERIA, extractJson } from '../commands/prompt-helpers.js'; -import { MicrosandboxClient, buildSecrets, buildAgentSecret, resolveEnv } from '../sandbox/microsandbox.js'; +import { MicrosandboxClient, buildSecrets, buildAgentSecret, resolveEnv, resolveOAuthToken } from '../sandbox/microsandbox.js'; import { createEgressLockdownLogger } from '../sandbox/egress-logger.js'; import { scaffoldWorkspace, uploadSources } from '../sandbox/scaffolding.js'; import { deduplicateSources } from '../core/source-resolver.js'; @@ -135,9 +135,14 @@ const INFRA_ALLOWLIST = [ export function buildJudgeAllowlist(judgeConfig: SandboxAgentConfig, config: Config): string[] { const hosts = new Set(); - // 1. Agent API endpoint from secret's baseUrl - if (judgeConfig.secret.baseUrl) { + // 1. Agent API endpoint — from secret.baseUrl (API-key path) or adapter default (OAuth path). 
+ if (judgeConfig.secret?.baseUrl) { try { hosts.add(new URL(judgeConfig.secret.baseUrl).hostname); } catch { /* skip malformed */ } + } else if (judgeConfig.useOAuth) { + const adapter = createAdapter(judgeConfig); + if (adapter.defaultBaseUrl) { + try { hosts.add(new URL(adapter.defaultBaseUrl).hostname); } catch { /* skip malformed */ } + } } // 2. Secrets allowHosts @@ -286,12 +291,19 @@ export async function runSandboxedJudge( const env = resolveEnv(config.sandbox?.env); const timeoutSecs = target.timeout ?? config.sandbox.defaultTimeout ?? 600; - // Merge agent secret into sandbox secrets and set base URL env var + // Resolve agent auth — same two-path model as the executor. const judgeAdapter = createAdapter(judgeConfig); - secrets.push(buildAgentSecret(judgeConfig.secret, judgeAdapter.additionalAllowHosts)); - const baseUrlVar = judgeConfig.secret.baseUrlEnvVar ?? judgeAdapter.baseUrlEnvVar; - if (baseUrlVar && judgeConfig.secret.baseUrl) { - env[baseUrlVar] = judgeConfig.secret.baseUrl; + if (judgeConfig.useOAuth) { + env.CLAUDE_CODE_OAUTH_TOKEN = resolveOAuthToken(); + if (judgeAdapter.baseUrlEnvVar && judgeAdapter.defaultBaseUrl) { + env[judgeAdapter.baseUrlEnvVar] = judgeAdapter.defaultBaseUrl; + } + } else if (judgeConfig.secret) { + secrets.push(buildAgentSecret(judgeConfig.secret, judgeAdapter.additionalAllowHosts)); + const baseUrlVar = judgeConfig.secret.baseUrlEnvVar ?? judgeAdapter.baseUrlEnvVar; + if (baseUrlVar && judgeConfig.secret.baseUrl) { + env[baseUrlVar] = judgeConfig.secret.baseUrl; + } } await client.create( diff --git a/src/types.ts b/src/types.ts index c3dae8f..4e04224 100644 --- a/src/types.ts +++ b/src/types.ts @@ -109,10 +109,26 @@ export interface AgentConfig { logPattern?: string; } -/** Agent config for sandboxed execution (executor/judge). Secret is required for microsandbox TLS injection. */ +/** Agent config for sandboxed execution (executor/judge). + * + * Auth: exactly one of `secret` or `useOAuth: true` is required. 
+ * + * - `secret` is the API-key path — values are TLS-injected by microsandbox so + * the cleartext never enters the VM, and the env contains a placeholder + * substituted on the wire only for the agent's allowed host. + * - `useOAuth: true` is the Claude Code subscription path — reads + * `CLAUDE_CODE_OAUTH_TOKEN` (generated by `claude setup-token`, requires + * Pro / Max / Team / Enterprise) from the host environment and injects it + * into the sandbox as a plain env var. Subscription auth is required here + * because Claude reads the token directly from `process.env`; the TLS + * substitution model does not work for OAuth. Only valid when + * `command: "claude"`. + */ export interface SandboxAgentConfig extends AgentConfig { /** Agent's API secret and base URL. Flows to microsandbox TLS injection, sandbox env, and judge lockdown allowlist. */ - secret: AgentSecretConfig; + secret?: AgentSecretConfig; + /** Use Claude Code subscription auth via `CLAUDE_CODE_OAUTH_TOKEN`. Only valid for `command: "claude"`. 
*/ + useOAuth?: boolean; } export interface TargetConfig { From 0422269ea4e635a72b062658a888787a9b3efc5d Mon Sep 17 00:00:00 2001 From: nickwinder Date: Fri, 15 May 2026 10:28:15 +1200 Subject: [PATCH 2/6] refactor(auth): detect auth mode from secret.value prefix, drop useOAuth flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the dedicated `useOAuth: true` field on `SandboxAgentConfig` with runtime prefix detection on the resolved `secret.value`: - `sk-ant-api-…` → API-key path (microsandbox TLS injection, unchanged) - `sk-ant-oat-…` → OAuth path (plain `CLAUDE_CODE_OAUTH_TOKEN` env var) User experience is now one consistent shape — always set `secret.value` to a host env var reference; the runtime picks the auth mode from the resolved value at sandbox-create time: "executor": { "command": "claude", "secret": { "value": "$ANTHROPIC_API_KEY" } } "executor": { "command": "claude", "secret": { "value": "$CLAUDE_CODE_OAUTH_TOKEN" } } Mechanics: - New `applyAgentAuth(secret, adapter, secrets, env)` helper in `microsandbox.ts` consolidates the 3 sandbox-creation call sites (`execute.ts`, `judge.ts`, `sandbox.ts`) into a single call. The helper handles both auth modes internally. - New `isOAuthSecret(secret)` helper exposes the same prefix check for the judge's `buildJudgeAllowlist` (which adds the adapter's default hostname to the lockdown allowlist when in OAuth mode, since the OAuth path has no `secret.baseUrl` to derive from). - `SandboxAgentConfig.secret` becomes required again (no parallel field). - Removed `resolveOAuthToken()` and `buildAgentSecret()` — both subsumed by `applyAgentAuth`. Trade-off: depends on Anthropic's documented OAuth-vs-API-key prefix scheme (`sk-ant-oat-` vs `sk-ant-api-`). If that scheme changes, the eval silently misclassifies. Caller failure mode is an auth error from the Claude API at request time, which is observable in run logs. 
Tests: drop 4 useOAuth config tests + 2 resolveOAuthToken tests; add 3 tests for `isOAuthSecret` (OAuth value, API-key value, unset env var) and 3 tests for `applyAgentAuth` (OAuth path, API-key path, missing required fields). 331 tests pass; type-check + lint clean. --- README.md | 12 +-- skills/_reference/config-schema.md | 13 +-- src/commands/__tests__/execute.test.ts | 3 +- src/commands/execute.ts | 18 +--- src/commands/sandbox.ts | 15 +--- src/core/__tests__/config.test.ts | 54 +++--------- src/core/config.ts | 73 +++++++---------- src/sandbox/__tests__/microsandbox.test.ts | 95 ++++++++++++++++++---- src/sandbox/microsandbox.ts | 80 ++++++++++++------ src/scoring/__tests__/judge.test.ts | 3 +- src/scoring/judge.ts | 25 ++---- src/types.ts | 32 ++++---- 12 files changed, 218 insertions(+), 205 deletions(-) diff --git a/README.md b/README.md index 26b1838..f907d61 100644 --- a/README.md +++ b/README.md @@ -366,13 +366,13 @@ Generator and insights agents run locally and do not require a secret. #### Claude Code subscription auth (avoid API billing) -If you have a Claude Pro / Max / Team / Enterprise subscription, sandbox agents using `command: "claude"` can authenticate via your subscription instead of paying per-token API charges. Set `useOAuth: true` instead of providing a `secret`: +If you have a Claude Pro / Max / Team / Enterprise subscription, sandbox agents using `command: "claude"` can authenticate via your subscription instead of paying per-token API charges. 
Point `secret.value` at the Claude Code OAuth token instead of an API key: ```json { "agents": { - "executor": { "command": "claude", "useOAuth": true }, - "judge": { "command": "claude", "useOAuth": true } + "executor": { "command": "claude", "secret": { "value": "$CLAUDE_CODE_OAUTH_TOKEN" } }, + "judge": { "command": "claude", "secret": { "value": "$CLAUDE_CODE_OAUTH_TOKEN" } } } } ``` @@ -384,11 +384,7 @@ claude setup-token # interactive — generates a long-lived OAuth to export CLAUDE_CODE_OAUTH_TOKEN='' # before running the eval ``` -Notes: -- Token is injected into the sandbox as a plain env var (Claude reads it directly from `process.env`; the API-key TLS-substitution model does not apply). -- Only valid for `command: "claude"`. The framework rejects `useOAuth: true` on other adapters at config-load time. -- Setting both `secret` and `useOAuth` is rejected — choose one path per role. -- Subscription concurrent-session caps apply. +How it works: the runtime sniffs the resolved value's prefix at sandbox-create time. Anthropic OAuth tokens start with `sk-ant-oat-`; API keys start with `sk-ant-api-`. When the value is an OAuth token, it's injected as a plain `CLAUDE_CODE_OAUTH_TOKEN` env var (Claude Code reads it directly from `process.env`; the API-key TLS-substitution model doesn't apply for OAuth). Subscription concurrent-session caps apply. #### Custom agents diff --git a/skills/_reference/config-schema.md b/skills/_reference/config-schema.md index c1bb0a5..8314c14 100644 --- a/skills/_reference/config-schema.md +++ b/skills/_reference/config-schema.md @@ -76,16 +76,18 @@ ### SandboxAgentConfig (executor, judge) -Extends AgentConfig with auth fields. 
**Exactly one of `secret` or `useOAuth` is required.** +Extends AgentConfig with one **required** field: | Field | Type | Required | |-------|------|----------| -| `secret` | `AgentSecretConfig` | One-of — API key via TLS injection | -| `useOAuth` | `boolean` | One-of — Claude Code subscription via `CLAUDE_CODE_OAUTH_TOKEN`. Only valid when `command: "claude"`. | +| `secret` | `AgentSecretConfig` | **Yes** | -**API-key path (`secret`)** — microsandbox TLS-injects the value, so the cleartext never enters the VM. Inside the sandbox the env var contains only a placeholder substituted on the wire for the allowed host. +Auth mode is auto-detected from the resolved `secret.value`'s prefix at sandbox-create time: -**Subscription path (`useOAuth: true`)** — reads `CLAUDE_CODE_OAUTH_TOKEN` from the host environment and injects it into the sandbox as a plain env var. Claude reads the token directly. Generate the token once with `claude setup-token` (Pro / Max / Team / Enterprise required), export it, then `useOAuth: true` on both executor and judge. Avoids per-token API billing. +- `sk-ant-api-…` (Anthropic API key) → microsandbox TLS-injects the value, so the cleartext never enters the VM. Inside the sandbox the env var contains only a placeholder substituted on the wire for the allowed host. +- `sk-ant-oat-…` (Claude Code subscription OAuth token, issued by `claude setup-token`) → injected as a plain `CLAUDE_CODE_OAUTH_TOKEN` env var. Claude Code reads the token directly from `process.env`, so the TLS-substitution model does not apply. Avoids per-token API billing on Pro / Max / Team / Enterprise plans. + +Point `secret.value` at the host env var that holds the credential — `"$ANTHROPIC_API_KEY"` for the API-key path, `"$CLAUDE_CODE_OAUTH_TOKEN"` for the subscription path. ### AgentSecretConfig @@ -152,7 +154,6 @@ Custom agents (any command not in the table above) **must** provide `envVar` and 7. 
`agents.executor` and `agents.judge` must have `secret.value` (non-empty string) 8. Custom agents must provide `envVar` and `baseUrl` in their secret 9. `baseUrl` must be a parseable URL -10. Each sandbox agent role (`executor`, `judge`) must declare auth: either `secret` (API-key path) or `useOAuth: true` (Claude Code subscription path). Setting both is rejected. `useOAuth: true` requires `command: "claude"`. ## Minimal Examples diff --git a/src/commands/__tests__/execute.test.ts b/src/commands/__tests__/execute.test.ts index 494ea7c..e600125 100644 --- a/src/commands/__tests__/execute.test.ts +++ b/src/commands/__tests__/execute.test.ts @@ -41,7 +41,8 @@ vi.mock('../../sandbox/microsandbox.js', () => { return { MicrosandboxClient: MockMicrosandboxClient, buildSecrets: vi.fn().mockReturnValue([]), - buildAgentSecret: vi.fn().mockReturnValue({}), + applyAgentAuth: vi.fn(), + isOAuthSecret: vi.fn().mockReturnValue(false), resolveEnv: vi.fn().mockReturnValue({}), }; }); diff --git a/src/commands/execute.ts b/src/commands/execute.ts index 77f43a7..58654ee 100644 --- a/src/commands/execute.ts +++ b/src/commands/execute.ts @@ -3,7 +3,7 @@ import ora from 'ora'; import { loadDotenv } from '../core/env.js'; import { loadConfig } from '../core/config.js'; import { loadTestSuite, saveResult, saveBinaryResult, formatElapsed } from '../core/suite-io.js'; -import { MicrosandboxClient, buildSecrets, buildAgentSecret, resolveEnv, resolveOAuthToken, type CommandResult } from '../sandbox/microsandbox.js'; +import { MicrosandboxClient, buildSecrets, applyAgentAuth, resolveEnv, type CommandResult } from '../sandbox/microsandbox.js'; import { createEgressLogger } from '../sandbox/egress-logger.js'; import { scaffoldWorkspace } from '../sandbox/scaffolding.js'; import { WorkerPool } from '../sandbox/worker-pool.js'; @@ -158,24 +158,10 @@ export async function executeTestCase( const env = resolveEnv(config.sandbox?.env); const timeoutSecs = target.timeout ?? 
config.sandbox.defaultTimeout ?? 600; - // Resolve agent auth. Two paths: - // - secret → microsandbox TLS-injected placeholder for an API key - // - useOAuth → plain CLAUDE_CODE_OAUTH_TOKEN env var (claude reads it directly) const executorConfig: SandboxAgentConfig = config.agents?.executor ?? { command: 'claude', secret: { value: '$ANTHROPIC_API_KEY' } }; const execAdapter = createAdapter(executorConfig); - if (executorConfig.useOAuth) { - env.CLAUDE_CODE_OAUTH_TOKEN = resolveOAuthToken(); - if (execAdapter.baseUrlEnvVar && execAdapter.defaultBaseUrl) { - env[execAdapter.baseUrlEnvVar] = execAdapter.defaultBaseUrl; - } - } else if (executorConfig.secret) { - secrets.push(buildAgentSecret(executorConfig.secret, execAdapter.additionalAllowHosts)); - const baseUrlVar = executorConfig.secret.baseUrlEnvVar ?? execAdapter.baseUrlEnvVar; - if (baseUrlVar && executorConfig.secret.baseUrl) { - env[baseUrlVar] = executorConfig.secret.baseUrl; - } - } + applyAgentAuth(executorConfig.secret, execAdapter, secrets, env); await client.create( sandboxName(testCase.id), diff --git a/src/commands/sandbox.ts b/src/commands/sandbox.ts index 2ca5c45..8645cf1 100644 --- a/src/commands/sandbox.ts +++ b/src/commands/sandbox.ts @@ -5,7 +5,7 @@ import { loadDotenv } from '../core/env.js'; import { loadConfig } from '../core/config.js'; import { loadTestSuite, loadBinaryResult } from '../core/suite-io.js'; import { loadJsonFile } from '../core/results.js'; -import { MicrosandboxClient, buildSecrets, buildAgentSecret, resolveEnv, resolveOAuthToken } from '../sandbox/microsandbox.js'; +import { MicrosandboxClient, buildSecrets, applyAgentAuth, resolveEnv } from '../sandbox/microsandbox.js'; import { scaffoldWorkspace, uploadSources } from '../sandbox/scaffolding.js'; import { createEgressLogger } from '../sandbox/egress-logger.js'; import { createAdapter } from '../agents/adapter.js'; @@ -59,18 +59,7 @@ export async function sandboxCommand(paths: ProjectPaths, options: SandboxOption if 
(options.mode) { agentConfig = getAgentConfig(config, options.mode); adapter = createAdapter(agentConfig); - if (agentConfig.useOAuth) { - env.CLAUDE_CODE_OAUTH_TOKEN = resolveOAuthToken(); - if (adapter.baseUrlEnvVar && adapter.defaultBaseUrl) { - env[adapter.baseUrlEnvVar] = adapter.defaultBaseUrl; - } - } else if (agentConfig.secret) { - secrets.push(buildAgentSecret(agentConfig.secret, adapter.additionalAllowHosts)); - const baseUrlVar = agentConfig.secret.baseUrlEnvVar ?? adapter.baseUrlEnvVar; - if (baseUrlVar && agentConfig.secret.baseUrl) { - env[baseUrlVar] = agentConfig.secret.baseUrl; - } - } + applyAgentAuth(agentConfig.secret, adapter, secrets, env); } // Prepare output directory for artifacts diff --git a/src/core/__tests__/config.test.ts b/src/core/__tests__/config.test.ts index 7b08938..b8bf405 100644 --- a/src/core/__tests__/config.test.ts +++ b/src/core/__tests__/config.test.ts @@ -149,9 +149,9 @@ describe('loadConfig', () => { mockReadFile.mockResolvedValue(JSON.stringify(config)); const result = await loadConfig('/fake/config.json'); // Defaults should be filled in - expect(result.agents?.judge?.secret?.envVar).toBe('ANTHROPIC_API_KEY'); - expect(result.agents?.judge?.secret?.baseUrl).toBe('https://api.anthropic.com'); - expect(result.agents?.judge?.secret?.baseUrlEnvVar).toBe('ANTHROPIC_BASE_URL'); + expect(result.agents?.judge?.secret.envVar).toBe('ANTHROPIC_API_KEY'); + expect(result.agents?.judge?.secret.baseUrl).toBe('https://api.anthropic.com'); + expect(result.agents?.judge?.secret.baseUrlEnvVar).toBe('ANTHROPIC_BASE_URL'); }); it('accepts known agent with all secret fields explicit', async () => { @@ -166,7 +166,7 @@ describe('loadConfig', () => { }; mockReadFile.mockResolvedValue(JSON.stringify(config)); const result = await loadConfig('/fake/config.json'); - expect(result.agents?.judge?.secret?.envVar).toBe('ANTHROPIC_API_KEY'); + expect(result.agents?.judge?.secret.envVar).toBe('ANTHROPIC_API_KEY'); }); it('throws when sandbox agent 
(executor) is missing secret', async () => { @@ -224,43 +224,13 @@ describe('loadConfig', () => { await expect(loadConfig('/fake/config.json')).rejects.toThrow(/valid URL/); }); - describe('useOAuth (Claude Code subscription auth)', () => { - it('accepts judge with useOAuth: true and no secret', async () => { - const config = { - ...validConfig, - agents: { judge: { command: 'claude', useOAuth: true } }, - }; - mockReadFile.mockResolvedValue(JSON.stringify(config)); - const result = await loadConfig('/fake/config.json'); - expect(result.agents?.judge?.useOAuth).toBe(true); - expect(result.agents?.judge?.secret).toBeUndefined(); - }); - - it('rejects useOAuth with command != claude', async () => { - const config = { - ...validConfig, - agents: { judge: { command: 'codex', useOAuth: true } }, - }; - mockReadFile.mockResolvedValue(JSON.stringify(config)); - await expect(loadConfig('/fake/config.json')).rejects.toThrow(/useOAuth.*only supported for command: "claude"/); - }); - - it('rejects sandbox role with neither secret nor useOAuth', async () => { - const config = { - ...validConfig, - agents: { judge: { command: 'claude' } }, - }; - mockReadFile.mockResolvedValue(JSON.stringify(config)); - await expect(loadConfig('/fake/config.json')).rejects.toThrow(/secret.*or.*useOAuth/); - }); - - it('rejects setting both secret and useOAuth on the same role', async () => { - const config = { - ...validConfig, - agents: { judge: { command: 'claude', useOAuth: true, secret: { value: '$ANTHROPIC_API_KEY' } } }, - }; - mockReadFile.mockResolvedValue(JSON.stringify(config)); - await expect(loadConfig('/fake/config.json')).rejects.toThrow(/cannot set both useOAuth and secret/); - }); + it('accepts secret pointing at $CLAUDE_CODE_OAUTH_TOKEN (auth mode resolved later by value prefix)', async () => { + const config = { + ...validConfig, + agents: { judge: { command: 'claude', secret: { value: '$CLAUDE_CODE_OAUTH_TOKEN' } } }, + }; + 
mockReadFile.mockResolvedValue(JSON.stringify(config)); + const result = await loadConfig('/fake/config.json'); + expect(result.agents?.judge?.secret?.value).toBe('$CLAUDE_CODE_OAUTH_TOKEN'); }); }); diff --git a/src/core/config.ts b/src/core/config.ts index 47408c4..a1d618b 100644 --- a/src/core/config.ts +++ b/src/core/config.ts @@ -129,55 +129,38 @@ export function validateConfig(data: unknown, configPath?: string): Config { const isSandboxRole = SANDBOX_ROLES.includes(role); if (isSandboxRole) { - const useOAuth = agent.useOAuth === true; - - if (useOAuth) { - // OAuth path: Claude Code subscription via CLAUDE_CODE_OAUTH_TOKEN. - if (command !== 'claude') { - throw new Error( - `agents.${role}.useOAuth: true is only supported for command: "claude" (Claude Code subscription auth). ` + - `Got command: "${command ?? '(unset)'}".`, - ); - } - if (agent.secret !== undefined) { - throw new Error( - `agents.${role} cannot set both useOAuth and secret — choose one auth path.`, - ); - } + // Sandbox agents (executor/judge) require secret. Auth mode (API key vs Claude Code + // subscription OAuth token) is auto-detected from the resolved value's prefix at + // sandbox-create time. + if (!agent.secret || typeof agent.secret !== 'object' || Array.isArray(agent.secret)) { + throw new Error(`agents.${role} requires a secret with at least { value } for secure sandbox execution`); + } + const secret = agent.secret as Record; + if (!secret.value || typeof secret.value !== 'string') { + throw new Error(`agents.${role}.secret.value must be a non-empty string`); + } + + // Fill defaults from adapter for known agents + const adapter = createAdapter({ command } as AgentConfig); + if (adapter.defaultEnvVar) { + if (!secret.envVar) secret.envVar = adapter.defaultEnvVar; + if (!secret.baseUrl) secret.baseUrl = adapter.defaultBaseUrl; + if (!secret.baseUrlEnvVar) secret.baseUrlEnvVar = adapter.baseUrlEnvVar; } else { - // API-key path: secret with TLS-injected value. 
- if (!agent.secret || typeof agent.secret !== 'object' || Array.isArray(agent.secret)) { - throw new Error( - `agents.${role} requires either { secret: {...} } or { useOAuth: true } for sandbox auth`, - ); + // Custom agents must specify envVar and baseUrl + if (!secret.envVar || typeof secret.envVar !== 'string') { + throw new Error(`agents.${role}.secret.envVar is required for custom agent '${command}'`); } - const secret = agent.secret as Record; - if (!secret.value || typeof secret.value !== 'string') { - throw new Error(`agents.${role}.secret.value must be a non-empty string`); - } - - // Fill defaults from adapter for known agents - const adapter = createAdapter({ command } as AgentConfig); - if (adapter.defaultEnvVar) { - if (!secret.envVar) secret.envVar = adapter.defaultEnvVar; - if (!secret.baseUrl) secret.baseUrl = adapter.defaultBaseUrl; - if (!secret.baseUrlEnvVar) secret.baseUrlEnvVar = adapter.baseUrlEnvVar; - } else { - // Custom agents must specify envVar and baseUrl - if (!secret.envVar || typeof secret.envVar !== 'string') { - throw new Error(`agents.${role}.secret.envVar is required for custom agent '${command}'`); - } - if (!secret.baseUrl || typeof secret.baseUrl !== 'string') { - throw new Error(`agents.${role}.secret.baseUrl is required for custom agent '${command}'`); - } + if (!secret.baseUrl || typeof secret.baseUrl !== 'string') { + throw new Error(`agents.${role}.secret.baseUrl is required for custom agent '${command}'`); } + } - // Validate baseUrl is a valid URL - try { - new URL(secret.baseUrl as string); - } catch { - throw new Error(`agents.${role}.secret.baseUrl must be a valid URL`); - } + // Validate baseUrl is a valid URL + try { + new URL(secret.baseUrl as string); + } catch { + throw new Error(`agents.${role}.secret.baseUrl must be a valid URL`); } } } diff --git a/src/sandbox/__tests__/microsandbox.test.ts b/src/sandbox/__tests__/microsandbox.test.ts index 31f0736..991bab4 100644 --- 
a/src/sandbox/__tests__/microsandbox.test.ts +++ b/src/sandbox/__tests__/microsandbox.test.ts @@ -1,5 +1,6 @@ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; -import { MicrosandboxClient, buildSecrets, resolveEnv, resolveOAuthToken } from '../microsandbox.js'; +import type { SecretEntry } from 'microsandbox'; +import { MicrosandboxClient, buildSecrets, resolveEnv, applyAgentAuth, isOAuthSecret } from '../microsandbox.js'; // ── Mocks ──────────────────────────────────────────────────────────────────── @@ -330,24 +331,90 @@ describe('MicrosandboxClient', () => { }); }); -describe('resolveOAuthToken', () => { - const ORIGINAL_TOKEN = process.env.CLAUDE_CODE_OAUTH_TOKEN; +describe('agent secret auth-mode detection', () => { + const ORIGINAL_API_KEY = process.env.ANTHROPIC_API_KEY; + const ORIGINAL_OAUTH = process.env.CLAUDE_CODE_OAUTH_TOKEN; afterEach(() => { - if (ORIGINAL_TOKEN === undefined) { - delete process.env.CLAUDE_CODE_OAUTH_TOKEN; - } else { - process.env.CLAUDE_CODE_OAUTH_TOKEN = ORIGINAL_TOKEN; - } + const restore = (key: string, value: string | undefined) => { + if (value === undefined) delete process.env[key]; + else process.env[key] = value; + }; + restore('ANTHROPIC_API_KEY', ORIGINAL_API_KEY); + restore('CLAUDE_CODE_OAUTH_TOKEN', ORIGINAL_OAUTH); }); - it('returns the token when CLAUDE_CODE_OAUTH_TOKEN is set on the host', () => { - process.env.CLAUDE_CODE_OAUTH_TOKEN = 'sk-ant-oat-test-value'; - expect(resolveOAuthToken()).toBe('sk-ant-oat-test-value'); + const claudeAdapter = { + baseUrlEnvVar: 'ANTHROPIC_BASE_URL', + defaultBaseUrl: 'https://api.anthropic.com', + additionalAllowHosts: [], + }; + + describe('isOAuthSecret', () => { + it('returns true when the resolved value starts with sk-ant-oat-', () => { + process.env.CLAUDE_CODE_OAUTH_TOKEN = 'sk-ant-oat-fake-test-token'; + expect(isOAuthSecret({ + envVar: 'CLAUDE_CODE_OAUTH_TOKEN', + value: '$CLAUDE_CODE_OAUTH_TOKEN', + baseUrl: 'https://api.anthropic.com', + 
})).toBe(true); + }); + + it('returns false for an API-key shaped value', () => { + process.env.ANTHROPIC_API_KEY = 'sk-ant-api-fake-test-key'; + expect(isOAuthSecret({ + envVar: 'ANTHROPIC_API_KEY', + value: '$ANTHROPIC_API_KEY', + baseUrl: 'https://api.anthropic.com', + })).toBe(false); + }); + + it('returns false when the referenced host env var is unset (no throw)', () => { + delete process.env.CLAUDE_CODE_OAUTH_TOKEN; + expect(isOAuthSecret({ + envVar: 'CLAUDE_CODE_OAUTH_TOKEN', + value: '$CLAUDE_CODE_OAUTH_TOKEN', + baseUrl: 'https://api.anthropic.com', + })).toBe(false); + }); }); - it('throws with a clear setup-token hint when CLAUDE_CODE_OAUTH_TOKEN is unset', () => { - delete process.env.CLAUDE_CODE_OAUTH_TOKEN; - expect(() => resolveOAuthToken()).toThrow(/claude setup-token/); + describe('applyAgentAuth', () => { + it('injects CLAUDE_CODE_OAUTH_TOKEN as a plain env var when value is an OAuth token', () => { + process.env.CLAUDE_CODE_OAUTH_TOKEN = 'sk-ant-oat-fake-test-token'; + const secrets: SecretEntry[] = []; + const env: Record = {}; + applyAgentAuth({ + envVar: 'CLAUDE_CODE_OAUTH_TOKEN', + value: '$CLAUDE_CODE_OAUTH_TOKEN', + baseUrl: 'https://api.anthropic.com', + }, claudeAdapter, secrets, env); + expect(env.CLAUDE_CODE_OAUTH_TOKEN).toBe('sk-ant-oat-fake-test-token'); + expect(env.ANTHROPIC_BASE_URL).toBe('https://api.anthropic.com'); + expect(secrets).toHaveLength(0); + }); + + it('wraps an API-key value in Secret.env() with the agent host on allowHosts', () => { + process.env.ANTHROPIC_API_KEY = 'sk-ant-api-fake-test-key'; + const secrets: SecretEntry[] = []; + const env: Record = {}; + applyAgentAuth({ + envVar: 'ANTHROPIC_API_KEY', + value: '$ANTHROPIC_API_KEY', + baseUrl: 'https://api.anthropic.com', + baseUrlEnvVar: 'ANTHROPIC_BASE_URL', + }, claudeAdapter, secrets, env); + expect(secrets).toHaveLength(1); + expect(env.ANTHROPIC_BASE_URL).toBe('https://api.anthropic.com'); + expect(env.CLAUDE_CODE_OAUTH_TOKEN).toBeUndefined(); + }); + + 
it('throws when envVar or baseUrl is missing', () => { + const secrets: SecretEntry[] = []; + const env: Record<string, string> = {}; + expect(() => applyAgentAuth({ + value: 'literal-value', + } as never, claudeAdapter, secrets, env)).toThrow(/envVar and baseUrl/); + }); }); }); \ No newline at end of file diff --git a/src/sandbox/microsandbox.ts b/src/sandbox/microsandbox.ts index 73f3a2e..1fc8efe 100644 --- a/src/sandbox/microsandbox.ts +++ b/src/sandbox/microsandbox.ts @@ -50,40 +50,68 @@ export function resolveEnv( } /** - * Build a microsandbox `Secret.env()` entry from an agent's secret config. - * The `allowHosts` is derived from the base URL hostname. + * Claude Code subscription OAuth tokens are prefixed `sk-ant-oat-` (issued by + * `claude setup-token`). API keys are prefixed `sk-ant-api-`. The auth mode is + * determined by inspecting the resolved secret value at sandbox-create time — + * no separate config flag needed. */ -export function buildAgentSecret(secret: AgentSecretConfig, additionalAllowHosts?: string[]): SecretEntry { - if (!secret.envVar || !secret.baseUrl) { - throw new Error('Agent secret must have envVar and baseUrl set (should be filled by config validation)'); +const OAUTH_TOKEN_PREFIX = 'sk-ant-oat-'; + +/** Whether the agent secret's resolved value is a Claude Code subscription OAuth token. */ +export function isOAuthSecret(secret: AgentSecretConfig): boolean { + if (!secret.envVar) return false; + try { + const value = resolveValue(secret.value, secret.envVar); + return value.startsWith(OAUTH_TOKEN_PREFIX); + } catch { + return false; } - const value = resolveValue(secret.value, secret.envVar); - const hostname = new URL(secret.baseUrl).hostname; - const allowHosts = [hostname, ...(additionalAllowHosts ??
[])]; - return Secret.env(secret.envVar, { value, allowHosts }); +} + +interface AgentAuthAdapter { + baseUrlEnvVar: string | null; + defaultBaseUrl: string | null; + additionalAllowHosts: string[]; } /** - * Resolve the Claude Code OAuth token from the host environment for - * `useOAuth: true` agent configs. Unlike API keys (TLS-injected as - * placeholders by microsandbox), OAuth tokens must enter the VM as the real - * value because Claude reads them directly from `process.env`. The caller - * places the returned value under `sandbox.env.CLAUDE_CODE_OAUTH_TOKEN`. + * Wire an agent's secret into the sandbox `secrets` and `env`, picking the auth + * mode by inspecting the resolved value: + * + * - Claude Code subscription OAuth tokens (prefix `sk-ant-oat-`) → plain + * `CLAUDE_CODE_OAUTH_TOKEN` env var. Claude Code reads the token directly + * from `process.env`, so microsandbox's TLS-substitution model doesn't apply. + * - Everything else (API keys for known agents, custom-agent secrets) → wrapped + * in `Secret.env()` with TLS substitution and the configured base URL env var. * - * Throws with a clear message if `CLAUDE_CODE_OAUTH_TOKEN` is not set on - * the host — fail-fast so the user knows to run `claude setup-token` and - * export the result before the eval starts. + * Mutates `secrets` and `env` in place. */ -export function resolveOAuthToken(): string { - const value = process.env.CLAUDE_CODE_OAUTH_TOKEN; - if (!value) { - throw new Error( - "CLAUDE_CODE_OAUTH_TOKEN is not set on the host.
" + - "Generate a long-lived subscription token with `claude setup-token` " + - "and `export CLAUDE_CODE_OAUTH_TOKEN=` before running the eval.", - ); +export function applyAgentAuth( + secret: AgentSecretConfig, + adapter: AgentAuthAdapter, + secrets: SecretEntry[], + env: Record<string, string>, +): void { + if (!secret.envVar || !secret.baseUrl) { + throw new Error('Agent secret must have envVar and baseUrl set (should be filled by config validation)'); + } + const value = resolveValue(secret.value, secret.envVar); + + if (value.startsWith(OAUTH_TOKEN_PREFIX)) { + env.CLAUDE_CODE_OAUTH_TOKEN = value; + if (adapter.baseUrlEnvVar && adapter.defaultBaseUrl) { + env[adapter.baseUrlEnvVar] = adapter.defaultBaseUrl; + } + return; + } + + const hostname = new URL(secret.baseUrl).hostname; + const allowHosts = [hostname, ...adapter.additionalAllowHosts]; + secrets.push(Secret.env(secret.envVar, { value, allowHosts })); + const baseUrlVar = secret.baseUrlEnvVar ?? adapter.baseUrlEnvVar; + if (baseUrlVar) { + env[baseUrlVar] = secret.baseUrl; } - return value; } function resolveValue(value: string, envVar: string): string { diff --git a/src/scoring/__tests__/judge.test.ts b/src/scoring/__tests__/judge.test.ts index 87c5b1e..ad13b8b 100644 --- a/src/scoring/__tests__/judge.test.ts +++ b/src/scoring/__tests__/judge.test.ts @@ -23,7 +23,8 @@ vi.mock('../../sandbox/microsandbox.js', () => ({ Object.assign(this, mockClient); }), buildSecrets: vi.fn().mockReturnValue([]), - buildAgentSecret: vi.fn().mockReturnValue({}), + applyAgentAuth: vi.fn(), + isOAuthSecret: vi.fn().mockReturnValue(false), resolveEnv: vi.fn().mockReturnValue({}), })); diff --git a/src/scoring/judge.ts b/src/scoring/judge.ts index dcf553d..eaae607 100644 --- a/src/scoring/judge.ts +++ b/src/scoring/judge.ts @@ -1,7 +1,7 @@ import type { SolutionFile, JudgeScore, TestCase, SandboxAgentConfig, TargetConfig, Config, ProjectPaths, SourceConfig } from '../types.js'; import { createAdapter } from '../agents/adapter.js'; import {
JUDGE_SCORING_CRITERIA, extractJson } from '../commands/prompt-helpers.js'; -import { MicrosandboxClient, buildSecrets, buildAgentSecret, resolveEnv, resolveOAuthToken } from '../sandbox/microsandbox.js'; +import { MicrosandboxClient, buildSecrets, applyAgentAuth, isOAuthSecret, resolveEnv } from '../sandbox/microsandbox.js'; import { createEgressLockdownLogger } from '../sandbox/egress-logger.js'; import { scaffoldWorkspace, uploadSources } from '../sandbox/scaffolding.js'; import { deduplicateSources } from '../core/source-resolver.js'; @@ -135,14 +135,15 @@ const INFRA_ALLOWLIST = [ export function buildJudgeAllowlist(judgeConfig: SandboxAgentConfig, config: Config): string[] { const hosts = new Set(); - // 1. Agent API endpoint — from secret.baseUrl (API-key path) or adapter default (OAuth path). - if (judgeConfig.secret?.baseUrl) { - try { hosts.add(new URL(judgeConfig.secret.baseUrl).hostname); } catch { /* skip malformed */ } - } else if (judgeConfig.useOAuth) { + // 1. Agent API endpoint — adapter default for OAuth tokens (Claude reads directly + // from process.env, so secret.baseUrl is irrelevant), else secret.baseUrl. + if (isOAuthSecret(judgeConfig.secret)) { const adapter = createAdapter(judgeConfig); if (adapter.defaultBaseUrl) { try { hosts.add(new URL(adapter.defaultBaseUrl).hostname); } catch { /* skip malformed */ } } + } else if (judgeConfig.secret.baseUrl) { + try { hosts.add(new URL(judgeConfig.secret.baseUrl).hostname); } catch { /* skip malformed */ } } // 2. Secrets allowHosts @@ -291,20 +292,8 @@ export async function runSandboxedJudge( const env = resolveEnv(config.sandbox?.env); const timeoutSecs = target.timeout ?? config.sandbox.defaultTimeout ?? 600; - // Resolve agent auth — same two-path model as the executor. 
const judgeAdapter = createAdapter(judgeConfig); - if (judgeConfig.useOAuth) { - env.CLAUDE_CODE_OAUTH_TOKEN = resolveOAuthToken(); - if (judgeAdapter.baseUrlEnvVar && judgeAdapter.defaultBaseUrl) { - env[judgeAdapter.baseUrlEnvVar] = judgeAdapter.defaultBaseUrl; - } - } else if (judgeConfig.secret) { - secrets.push(buildAgentSecret(judgeConfig.secret, judgeAdapter.additionalAllowHosts)); - const baseUrlVar = judgeConfig.secret.baseUrlEnvVar ?? judgeAdapter.baseUrlEnvVar; - if (baseUrlVar && judgeConfig.secret.baseUrl) { - env[baseUrlVar] = judgeConfig.secret.baseUrl; - } - } + applyAgentAuth(judgeConfig.secret, judgeAdapter, secrets, env); await client.create( sandboxName(testCase.id), diff --git a/src/types.ts b/src/types.ts index 4e04224..966ecd6 100644 --- a/src/types.ts +++ b/src/types.ts @@ -111,24 +111,26 @@ export interface AgentConfig { /** Agent config for sandboxed execution (executor/judge). * - * Auth: exactly one of `secret` or `useOAuth: true` is required. + * Auth mode is auto-detected from the resolved `secret.value`: * - * - `secret` is the API-key path — values are TLS-injected by microsandbox so - * the cleartext never enters the VM, and the env contains a placeholder - * substituted on the wire only for the agent's allowed host. - * - `useOAuth: true` is the Claude Code subscription path — reads - * `CLAUDE_CODE_OAUTH_TOKEN` (generated by `claude setup-token`, requires - * Pro / Max / Team / Enterprise) from the host environment and injects it - * into the sandbox as a plain env var. Subscription auth is required here - * because Claude reads the token directly from `process.env`; the TLS - * substitution model does not work for OAuth. Only valid when - * `command: "claude"`. + * - API keys (anything not matching the OAuth prefix) are TLS-injected by + * microsandbox — cleartext never enters the VM; the env var inside the + * sandbox contains a placeholder substituted on the wire only for the + * agent's allowed host. 
+ * - Claude Code subscription OAuth tokens (prefix `sk-ant-oat-`, issued by + * `claude setup-token`) are injected as a plain `CLAUDE_CODE_OAUTH_TOKEN` + * env var. Claude Code reads the token directly from `process.env`, so the + * TLS-substitution model does not apply. Avoids per-token API billing on + * Pro / Max / Team / Enterprise plans. + * + * Point `secret.value` at the host env var that holds the credential — + * `$ANTHROPIC_API_KEY` for the API-key path, `$CLAUDE_CODE_OAUTH_TOKEN` for + * the subscription path. The runtime sniffs the resolved value to pick the + * path. */ export interface SandboxAgentConfig extends AgentConfig { - /** Agent's API secret and base URL. Flows to microsandbox TLS injection, sandbox env, and judge lockdown allowlist. */ - secret?: AgentSecretConfig; - /** Use Claude Code subscription auth via `CLAUDE_CODE_OAUTH_TOKEN`. Only valid for `command: "claude"`. */ - useOAuth?: boolean; + /** Agent's secret and base URL. Auth mode is determined from the resolved value's prefix. */ + secret: AgentSecretConfig; } export interface TargetConfig { From 7d510fff304a6723de6ae74f5cde5b0ec4fa71f8 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Fri, 15 May 2026 10:42:25 +1200 Subject: [PATCH 3/6] fix(auth): OAuth token prefix is sk-ant-oat (followed by version), not sk-ant-oat- MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Smoke-tested against a real `claude setup-token` token and discovered the prefix is `sk-ant-oat01-…`, not `sk-ant-oat-…`. The trailing dash in OAUTH_TOKEN_PREFIX caused all real OAuth tokens to misclassify as API keys and route through the TLS-injection path. Dropping the trailing dash: - matches every documented variant: `sk-ant-oat01-…`, `sk-ant-oat02-…`, etc. - still cleanly distinguishes from API keys (`sk-ant-api…`) since `oat` ≠ `api`. Test fixtures and docs updated to the real `sk-ant-oat01-` form. 
Verified end-to-end with the smoke script: isOAuthSecret returns true, applyAgentAuth populates env.CLAUDE_CODE_OAUTH_TOKEN as plain env var, no TLS secrets added. 331 tests pass; type-check + lint clean. --- README.md | 2 +- skills/_reference/config-schema.md | 4 ++-- src/sandbox/__tests__/microsandbox.test.ts | 8 ++++---- src/sandbox/microsandbox.ts | 14 ++++++++------ src/types.ts | 3 ++- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index f907d61..91671f6 100644 --- a/README.md +++ b/README.md @@ -384,7 +384,7 @@ claude setup-token # interactive — generates a long-lived OAuth to export CLAUDE_CODE_OAUTH_TOKEN='' # before running the eval ``` -How it works: the runtime sniffs the resolved value's prefix at sandbox-create time. Anthropic OAuth tokens start with `sk-ant-oat-`; API keys start with `sk-ant-api-`. When the value is an OAuth token, it's injected as a plain `CLAUDE_CODE_OAUTH_TOKEN` env var (Claude Code reads it directly from `process.env`; the API-key TLS-substitution model doesn't apply for OAuth). Subscription concurrent-session caps apply. +How it works: the runtime sniffs the resolved value's prefix at sandbox-create time. Anthropic OAuth tokens start with `sk-ant-oat` (e.g. `sk-ant-oat01-…`); API keys start with `sk-ant-api` (e.g. `sk-ant-api03-…`). When the value is an OAuth token, it's injected as a plain `CLAUDE_CODE_OAUTH_TOKEN` env var (Claude Code reads it directly from `process.env`; the API-key TLS-substitution model doesn't apply for OAuth). Subscription concurrent-session caps apply. 
#### Custom agents diff --git a/skills/_reference/config-schema.md b/skills/_reference/config-schema.md index 8314c14..23fad17 100644 --- a/skills/_reference/config-schema.md +++ b/skills/_reference/config-schema.md @@ -84,8 +84,8 @@ Extends AgentConfig with one **required** field: Auth mode is auto-detected from the resolved `secret.value`'s prefix at sandbox-create time: -- `sk-ant-api-…` (Anthropic API key) → microsandbox TLS-injects the value, so the cleartext never enters the VM. Inside the sandbox the env var contains only a placeholder substituted on the wire for the allowed host. -- `sk-ant-oat-…` (Claude Code subscription OAuth token, issued by `claude setup-token`) → injected as a plain `CLAUDE_CODE_OAUTH_TOKEN` env var. Claude Code reads the token directly from `process.env`, so the TLS-substitution model does not apply. Avoids per-token API billing on Pro / Max / Team / Enterprise plans. +- `sk-ant-api…` (Anthropic API key, e.g. `sk-ant-api03-…`) → microsandbox TLS-injects the value, so the cleartext never enters the VM. Inside the sandbox the env var contains only a placeholder substituted on the wire for the allowed host. +- `sk-ant-oat…` (Claude Code subscription OAuth token, e.g. `sk-ant-oat01-…`, issued by `claude setup-token`) → injected as a plain `CLAUDE_CODE_OAUTH_TOKEN` env var. Claude Code reads the token directly from `process.env`, so the TLS-substitution model does not apply. Avoids per-token API billing on Pro / Max / Team / Enterprise plans. Point `secret.value` at the host env var that holds the credential — `"$ANTHROPIC_API_KEY"` for the API-key path, `"$CLAUDE_CODE_OAUTH_TOKEN"` for the subscription path. 
diff --git a/src/sandbox/__tests__/microsandbox.test.ts b/src/sandbox/__tests__/microsandbox.test.ts index 991bab4..d8a7865 100644 --- a/src/sandbox/__tests__/microsandbox.test.ts +++ b/src/sandbox/__tests__/microsandbox.test.ts @@ -351,8 +351,8 @@ describe('agent secret auth-mode detection', () => { }; describe('isOAuthSecret', () => { - it('returns true when the resolved value starts with sk-ant-oat-', () => { - process.env.CLAUDE_CODE_OAUTH_TOKEN = 'sk-ant-oat-fake-test-token'; + it('returns true when the resolved value starts with sk-ant-oat (followed by a version, e.g. sk-ant-oat01-)', () => { + process.env.CLAUDE_CODE_OAUTH_TOKEN = 'sk-ant-oat01-fake-test-token'; expect(isOAuthSecret({ envVar: 'CLAUDE_CODE_OAUTH_TOKEN', value: '$CLAUDE_CODE_OAUTH_TOKEN', @@ -381,7 +381,7 @@ describe('agent secret auth-mode detection', () => { describe('applyAgentAuth', () => { it('injects CLAUDE_CODE_OAUTH_TOKEN as a plain env var when value is an OAuth token', () => { - process.env.CLAUDE_CODE_OAUTH_TOKEN = 'sk-ant-oat-fake-test-token'; + process.env.CLAUDE_CODE_OAUTH_TOKEN = 'sk-ant-oat01-fake-test-token'; const secrets: SecretEntry[] = []; const env: Record = {}; applyAgentAuth({ @@ -389,7 +389,7 @@ describe('agent secret auth-mode detection', () => { value: '$CLAUDE_CODE_OAUTH_TOKEN', baseUrl: 'https://api.anthropic.com', }, claudeAdapter, secrets, env); - expect(env.CLAUDE_CODE_OAUTH_TOKEN).toBe('sk-ant-oat-fake-test-token'); + expect(env.CLAUDE_CODE_OAUTH_TOKEN).toBe('sk-ant-oat01-fake-test-token'); expect(env.ANTHROPIC_BASE_URL).toBe('https://api.anthropic.com'); expect(secrets).toHaveLength(0); }); diff --git a/src/sandbox/microsandbox.ts b/src/sandbox/microsandbox.ts index 1fc8efe..0a8dfb0 100644 --- a/src/sandbox/microsandbox.ts +++ b/src/sandbox/microsandbox.ts @@ -50,12 +50,13 @@ export function resolveEnv( } /** - * Claude Code subscription OAuth tokens are prefixed `sk-ant-oat-` (issued by - * `claude setup-token`). API keys are prefixed `sk-ant-api-`. 
The auth mode is - * determined by inspecting the resolved secret value at sandbox-create time — - * no separate config flag needed. + * Claude Code subscription OAuth tokens are prefixed `sk-ant-oat` followed by a + * version number (e.g. `sk-ant-oat01-…`), issued by `claude setup-token`. API + * keys use `sk-ant-api` (e.g. `sk-ant-api03-…`). The auth mode is determined + * by inspecting the resolved secret value at sandbox-create time — no separate + * config flag needed. */ -const OAUTH_TOKEN_PREFIX = 'sk-ant-oat-'; +const OAUTH_TOKEN_PREFIX = 'sk-ant-oat'; /** Whether the agent secret's resolved value is a Claude Code subscription OAuth token. */ export function isOAuthSecret(secret: AgentSecretConfig): boolean { @@ -78,7 +79,8 @@ interface AgentAuthAdapter { * Wire an agent's secret into the sandbox `secrets` and `env`, picking the auth * mode by inspecting the resolved value: * - * - Claude Code subscription OAuth tokens (prefix `sk-ant-oat-`) → plain + * - Claude Code subscription OAuth tokens (prefix `sk-ant-oat`, e.g. + * `sk-ant-oat01-…`) → plain * `CLAUDE_CODE_OAUTH_TOKEN` env var. Claude Code reads the token directly * from `process.env`, so microsandbox's TLS-substitution model doesn't apply. * - Everything else (API keys for known agents, custom-agent secrets) → wrapped diff --git a/src/types.ts b/src/types.ts index 966ecd6..7b92ef4 100644 --- a/src/types.ts +++ b/src/types.ts @@ -117,7 +117,8 @@ export interface AgentConfig { * microsandbox — cleartext never enters the VM; the env var inside the * sandbox contains a placeholder substituted on the wire only for the * agent's allowed host. - * - Claude Code subscription OAuth tokens (prefix `sk-ant-oat-`, issued by + * - Claude Code subscription OAuth tokens (prefix `sk-ant-oat`, e.g. + * `sk-ant-oat01-…`, issued by * `claude setup-token`) are injected as a plain `CLAUDE_CODE_OAUTH_TOKEN` * env var. Claude Code reads the token directly from `process.env`, so the * TLS-substitution model does not apply. 
Avoids per-token API billing on From 6bc5c3aebd47a378ccea4d69964df0225dac0ba1 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Fri, 15 May 2026 11:24:55 +1200 Subject: [PATCH 4/6] refactor(auth): unify OAuth and API-key paths via TLS substitution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original PR routed OAuth through plain env-var injection on the assumption that Claude Code's CLAUDE_CODE_OAUTH_TOKEN reader was incompatible with microsandbox's wire-time placeholder substitution. Smoke-testing the placeholder path against a real Claude Code session proved that wrong: Claude tolerates a `$MSB_CLAUDE_CODE_OAUTH_TOKEN` placeholder as the env var value, constructs `Authorization: Bearer $MSB_…` as the outbound header, and microsandbox substitutes the placeholder for the real OAuth token at TLS interception time — Anthropic returned 200 on `/api/eval/sdk-…` and the eval completed end-to-end. Collapse the two-mode dispatch in `applyAgentAuth` into one TLS-substituted path. The resolved value's prefix only picks the env var name that carries the placeholder: - `sk-ant-oat…` → `CLAUDE_CODE_OAUTH_TOKEN` - anything else → `secret.envVar` (= `ANTHROPIC_API_KEY` for claude, etc.) Benefits: - OAuth recovers the same "cleartext never enters the VM" security property API keys already had — the real subscription token only ever touches the outbound TLS layer to api.anthropic.com. - One code path, fewer test modes. `isOAuthSecret` was only used to pick the env var name (now inlined as a local conditional) and to choose the allowlist hostname in `buildJudgeAllowlist` — but both auth paths now derive that hostname from `secret.baseUrl` (validation already fills it from the adapter default for known agents), so the OAuth branch in `buildJudgeAllowlist` is gone too. - Less surface area in the public module API (`isOAuthSecret` removed from exports).
Tests collapse from a six-test isOAuthSecret + applyAgentAuth suite to three unified `applyAgentAuth` cases (OAuth value → CLAUDE_CODE_OAUTH_TOKEN slot; API-key value → adapter slot; precondition error). 328 unit tests pass; type-check + lint clean. End-to-end verified against TC-001 with a real CLAUDE_CODE_OAUTH_TOKEN: exit 0, 27s, real solution produced; egress log shows `Authorization: Bearer $MSB_CLAUDE_CODE_OAUTH_TOKEN` (pre-substitution), `/api/claude_code/settings` returns 404 (auth accepted; would be 401 if the placeholder leaked to the wire). --- README.md | 2 +- skills/_reference/config-schema.md | 8 +- src/sandbox/__tests__/microsandbox.test.ts | 106 ++++++++------------- src/sandbox/microsandbox.ts | 44 ++++----- src/scoring/judge.ts | 14 +-- src/types.ts | 26 ++--- 6 files changed, 82 insertions(+), 118 deletions(-) diff --git a/README.md b/README.md index 91671f6..982bb7e 100644 --- a/README.md +++ b/README.md @@ -384,7 +384,7 @@ claude setup-token # interactive — generates a long-lived OAuth to export CLAUDE_CODE_OAUTH_TOKEN='' # before running the eval ``` -How it works: the runtime sniffs the resolved value's prefix at sandbox-create time. Anthropic OAuth tokens start with `sk-ant-oat` (e.g. `sk-ant-oat01-…`); API keys start with `sk-ant-api` (e.g. `sk-ant-api03-…`). When the value is an OAuth token, it's injected as a plain `CLAUDE_CODE_OAUTH_TOKEN` env var (Claude Code reads it directly from `process.env`; the API-key TLS-substitution model doesn't apply for OAuth). Subscription concurrent-session caps apply. +How it works: the runtime sniffs the resolved value's prefix at sandbox-create time. Anthropic OAuth tokens start with `sk-ant-oat` (e.g. `sk-ant-oat01-…`); API keys start with `sk-ant-api` (e.g. `sk-ant-api03-…`).
Both paths flow through microsandbox's `Secret.env()` TLS substitution — cleartext never enters the VM; the env var inside the sandbox contains a placeholder, and microsandbox swaps it for the real value on outbound TLS to `api.anthropic.com` only. The prefix only decides which env var name (`CLAUDE_CODE_OAUTH_TOKEN` vs `ANTHROPIC_API_KEY`) carries the placeholder. Subscription concurrent-session caps apply. #### Custom agents diff --git a/skills/_reference/config-schema.md b/skills/_reference/config-schema.md index 23fad17..79af65b 100644 --- a/skills/_reference/config-schema.md +++ b/skills/_reference/config-schema.md @@ -82,10 +82,12 @@ Extends AgentConfig with one **required** field: |-------|------|----------| | `secret` | `AgentSecretConfig` | **Yes** | -Auth mode is auto-detected from the resolved `secret.value`'s prefix at sandbox-create time: +Both auth modes flow through microsandbox `Secret.env()` TLS substitution — the cleartext credential never enters the VM. Inside the sandbox the env var contains a `$MSB_` placeholder; microsandbox swaps it for the real value on outbound TLS to the allowed host only. -- `sk-ant-api…` (Anthropic API key, e.g. `sk-ant-api03-…`) → microsandbox TLS-injects the value, so the cleartext never enters the VM. Inside the sandbox the env var contains only a placeholder substituted on the wire for the allowed host. -- `sk-ant-oat…` (Claude Code subscription OAuth token, e.g. `sk-ant-oat01-…`, issued by `claude setup-token`) → injected as a plain `CLAUDE_CODE_OAUTH_TOKEN` env var. Claude Code reads the token directly from `process.env`, so the TLS-substitution model does not apply. Avoids per-token API billing on Pro / Max / Team / Enterprise plans. +The resolved `secret.value`'s prefix picks which env var name carries the placeholder: + +- `sk-ant-api…` (Anthropic API key, e.g. `sk-ant-api03-…`) → `ANTHROPIC_API_KEY`. +- `sk-ant-oat…` (Claude Code subscription OAuth token, e.g. 
`sk-ant-oat01-…`, issued by `claude setup-token`) → `CLAUDE_CODE_OAUTH_TOKEN`. Avoids per-token API billing on Pro / Max / Team / Enterprise plans. Point `secret.value` at the host env var that holds the credential — `"$ANTHROPIC_API_KEY"` for the API-key path, `"$CLAUDE_CODE_OAUTH_TOKEN"` for the subscription path. diff --git a/src/sandbox/__tests__/microsandbox.test.ts b/src/sandbox/__tests__/microsandbox.test.ts index d8a7865..50346bf 100644 --- a/src/sandbox/__tests__/microsandbox.test.ts +++ b/src/sandbox/__tests__/microsandbox.test.ts @@ -1,6 +1,6 @@ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; import type { SecretEntry } from 'microsandbox'; -import { MicrosandboxClient, buildSecrets, resolveEnv, applyAgentAuth, isOAuthSecret } from '../microsandbox.js'; +import { MicrosandboxClient, buildSecrets, resolveEnv, applyAgentAuth } from '../microsandbox.js'; // ── Mocks ──────────────────────────────────────────────────────────────────── @@ -331,7 +331,7 @@ describe('MicrosandboxClient', () => { }); }); -describe('agent secret auth-mode detection', () => { +describe('applyAgentAuth', () => { const ORIGINAL_API_KEY = process.env.ANTHROPIC_API_KEY; const ORIGINAL_OAUTH = process.env.CLAUDE_CODE_OAUTH_TOKEN; @@ -350,71 +350,47 @@ describe('agent secret auth-mode detection', () => { additionalAllowHosts: [], }; - describe('isOAuthSecret', () => { - it('returns true when the resolved value starts with sk-ant-oat (followed by a version, e.g. 
sk-ant-oat01-)', () => { - process.env.CLAUDE_CODE_OAUTH_TOKEN = 'sk-ant-oat01-fake-test-token'; - expect(isOAuthSecret({ - envVar: 'CLAUDE_CODE_OAUTH_TOKEN', - value: '$CLAUDE_CODE_OAUTH_TOKEN', - baseUrl: 'https://api.anthropic.com', - })).toBe(true); - }); - - it('returns false for an API-key shaped value', () => { - process.env.ANTHROPIC_API_KEY = 'sk-ant-api-fake-test-key'; - expect(isOAuthSecret({ - envVar: 'ANTHROPIC_API_KEY', - value: '$ANTHROPIC_API_KEY', - baseUrl: 'https://api.anthropic.com', - })).toBe(false); - }); - - it('returns false when the referenced host env var is unset (no throw)', () => { - delete process.env.CLAUDE_CODE_OAUTH_TOKEN; - expect(isOAuthSecret({ - envVar: 'CLAUDE_CODE_OAUTH_TOKEN', - value: '$CLAUDE_CODE_OAUTH_TOKEN', - baseUrl: 'https://api.anthropic.com', - })).toBe(false); - }); + // The microsandbox `SecretEntry` is opaque, but inspecting its keys gives us + // enough confidence that the right env var name is being TLS-substituted. + const secretEnvVarName = (entry: SecretEntry): string | undefined => + (entry as { envVar?: string; env_var?: string; name?: string }).envVar + ?? (entry as { env_var?: string }).env_var + ?? 
(entry as { name?: string }).name; + + it('routes an OAuth-prefixed value through Secret.env under CLAUDE_CODE_OAUTH_TOKEN', () => { + process.env.CLAUDE_CODE_OAUTH_TOKEN = 'sk-ant-oat01-fake-test-token'; + const secrets: SecretEntry[] = []; + const env: Record = {}; + applyAgentAuth({ + envVar: 'CLAUDE_CODE_OAUTH_TOKEN', + value: '$CLAUDE_CODE_OAUTH_TOKEN', + baseUrl: 'https://api.anthropic.com', + }, claudeAdapter, secrets, env); + expect(secrets).toHaveLength(1); + expect(secretEnvVarName(secrets[0])).toBe('CLAUDE_CODE_OAUTH_TOKEN'); + expect(env.ANTHROPIC_BASE_URL).toBe('https://api.anthropic.com'); }); - describe('applyAgentAuth', () => { - it('injects CLAUDE_CODE_OAUTH_TOKEN as a plain env var when value is an OAuth token', () => { - process.env.CLAUDE_CODE_OAUTH_TOKEN = 'sk-ant-oat01-fake-test-token'; - const secrets: SecretEntry[] = []; - const env: Record = {}; - applyAgentAuth({ - envVar: 'CLAUDE_CODE_OAUTH_TOKEN', - value: '$CLAUDE_CODE_OAUTH_TOKEN', - baseUrl: 'https://api.anthropic.com', - }, claudeAdapter, secrets, env); - expect(env.CLAUDE_CODE_OAUTH_TOKEN).toBe('sk-ant-oat01-fake-test-token'); - expect(env.ANTHROPIC_BASE_URL).toBe('https://api.anthropic.com'); - expect(secrets).toHaveLength(0); - }); - - it('wraps an API-key value in Secret.env() with the agent host on allowHosts', () => { - process.env.ANTHROPIC_API_KEY = 'sk-ant-api-fake-test-key'; - const secrets: SecretEntry[] = []; - const env: Record = {}; - applyAgentAuth({ - envVar: 'ANTHROPIC_API_KEY', - value: '$ANTHROPIC_API_KEY', - baseUrl: 'https://api.anthropic.com', - baseUrlEnvVar: 'ANTHROPIC_BASE_URL', - }, claudeAdapter, secrets, env); - expect(secrets).toHaveLength(1); - expect(env.ANTHROPIC_BASE_URL).toBe('https://api.anthropic.com'); - expect(env.CLAUDE_CODE_OAUTH_TOKEN).toBeUndefined(); - }); + it('routes an API-key value through Secret.env under the agent-specific env var', () => { + process.env.ANTHROPIC_API_KEY = 'sk-ant-api03-fake-test-key'; + const secrets: SecretEntry[] = 
[]; + const env: Record = {}; + applyAgentAuth({ + envVar: 'ANTHROPIC_API_KEY', + value: '$ANTHROPIC_API_KEY', + baseUrl: 'https://api.anthropic.com', + baseUrlEnvVar: 'ANTHROPIC_BASE_URL', + }, claudeAdapter, secrets, env); + expect(secrets).toHaveLength(1); + expect(secretEnvVarName(secrets[0])).toBe('ANTHROPIC_API_KEY'); + expect(env.ANTHROPIC_BASE_URL).toBe('https://api.anthropic.com'); + }); - it('throws when envVar or baseUrl is missing', () => { - const secrets: SecretEntry[] = []; - const env: Record = {}; - expect(() => applyAgentAuth({ - value: 'literal-value', - } as never, claudeAdapter, secrets, env)).toThrow(/envVar and baseUrl/); - }); + it('throws when envVar or baseUrl is missing', () => { + const secrets: SecretEntry[] = []; + const env: Record = {}; + expect(() => applyAgentAuth({ + value: 'literal-value', + } as never, claudeAdapter, secrets, env)).toThrow(/envVar and baseUrl/); }); }); \ No newline at end of file diff --git a/src/sandbox/microsandbox.ts b/src/sandbox/microsandbox.ts index 0a8dfb0..f661894 100644 --- a/src/sandbox/microsandbox.ts +++ b/src/sandbox/microsandbox.ts @@ -58,17 +58,6 @@ export function resolveEnv( */ const OAUTH_TOKEN_PREFIX = 'sk-ant-oat'; -/** Whether the agent secret's resolved value is a Claude Code subscription OAuth token. */ -export function isOAuthSecret(secret: AgentSecretConfig): boolean { - if (!secret.envVar) return false; - try { - const value = resolveValue(secret.value, secret.envVar); - return value.startsWith(OAUTH_TOKEN_PREFIX); - } catch { - return false; - } -} - interface AgentAuthAdapter { baseUrlEnvVar: string | null; defaultBaseUrl: string | null; @@ -76,15 +65,19 @@ interface AgentAuthAdapter { } /** - * Wire an agent's secret into the sandbox `secrets` and `env`, picking the auth - * mode by inspecting the resolved value: + * Wire an agent's secret into the sandbox `secrets` and `env`. 
+ * + * Both auth modes (API key and Claude Code subscription OAuth) go through + * microsandbox `Secret.env()` TLS substitution — the cleartext value never + * enters the VM. Inside the sandbox the env var contains the + * `$MSB_` placeholder; microsandbox swaps it for the real value + * on outbound TLS to the allowed host only. * - * - Claude Code subscription OAuth tokens (prefix `sk-ant-oat`, e.g. - * `sk-ant-oat01-…`) → plain - * `CLAUDE_CODE_OAUTH_TOKEN` env var. Claude Code reads the token directly - * from `process.env`, so microsandbox's TLS-substitution model doesn't apply. - * - Everything else (API keys for known agents, custom-agent secrets) → wrapped - * in `Secret.env()` with TLS substitution and the configured base URL env var. + * The resolved value's prefix picks which env var name carries the placeholder: + * - `sk-ant-oat…` (Claude Code subscription OAuth, issued by `claude setup-token`) + * → `CLAUDE_CODE_OAUTH_TOKEN` + * - anything else (API keys for known agents, custom-agent secrets) + * → `secret.envVar` (= `ANTHROPIC_API_KEY` for claude, etc.) * * Mutates `secrets` and `env` in place. */ @@ -99,17 +92,14 @@ export function applyAgentAuth( } const value = resolveValue(secret.value, secret.envVar); - if (value.startsWith(OAUTH_TOKEN_PREFIX)) { - env.CLAUDE_CODE_OAUTH_TOKEN = value; - if (adapter.baseUrlEnvVar && adapter.defaultBaseUrl) { - env[adapter.baseUrlEnvVar] = adapter.defaultBaseUrl; - } - return; - } + const envVar = value.startsWith(OAUTH_TOKEN_PREFIX) + ? 'CLAUDE_CODE_OAUTH_TOKEN' + : secret.envVar; const hostname = new URL(secret.baseUrl).hostname; const allowHosts = [hostname, ...adapter.additionalAllowHosts]; - secrets.push(Secret.env(secret.envVar, { value, allowHosts })); + secrets.push(Secret.env(envVar, { value, allowHosts })); + const baseUrlVar = secret.baseUrlEnvVar ?? 
adapter.baseUrlEnvVar; if (baseUrlVar) { env[baseUrlVar] = secret.baseUrl; diff --git a/src/scoring/judge.ts b/src/scoring/judge.ts index eaae607..7650154 100644 --- a/src/scoring/judge.ts +++ b/src/scoring/judge.ts @@ -1,7 +1,7 @@ import type { SolutionFile, JudgeScore, TestCase, SandboxAgentConfig, TargetConfig, Config, ProjectPaths, SourceConfig } from '../types.js'; import { createAdapter } from '../agents/adapter.js'; import { JUDGE_SCORING_CRITERIA, extractJson } from '../commands/prompt-helpers.js'; -import { MicrosandboxClient, buildSecrets, applyAgentAuth, isOAuthSecret, resolveEnv } from '../sandbox/microsandbox.js'; +import { MicrosandboxClient, buildSecrets, applyAgentAuth, resolveEnv } from '../sandbox/microsandbox.js'; import { createEgressLockdownLogger } from '../sandbox/egress-logger.js'; import { scaffoldWorkspace, uploadSources } from '../sandbox/scaffolding.js'; import { deduplicateSources } from '../core/source-resolver.js'; @@ -135,14 +135,10 @@ const INFRA_ALLOWLIST = [ export function buildJudgeAllowlist(judgeConfig: SandboxAgentConfig, config: Config): string[] { const hosts = new Set(); - // 1. Agent API endpoint — adapter default for OAuth tokens (Claude reads directly - // from process.env, so secret.baseUrl is irrelevant), else secret.baseUrl. - if (isOAuthSecret(judgeConfig.secret)) { - const adapter = createAdapter(judgeConfig); - if (adapter.defaultBaseUrl) { - try { hosts.add(new URL(adapter.defaultBaseUrl).hostname); } catch { /* skip malformed */ } - } - } else if (judgeConfig.secret.baseUrl) { + // 1. Agent API endpoint — secret.baseUrl is always populated by validation + // (filled from adapter defaults for known agents). Same source whether + // the secret resolves to an API key or an OAuth token. 
+ if (judgeConfig.secret.baseUrl) { try { hosts.add(new URL(judgeConfig.secret.baseUrl).hostname); } catch { /* skip malformed */ } } diff --git a/src/types.ts b/src/types.ts index 7b92ef4..1621ace 100644 --- a/src/types.ts +++ b/src/types.ts @@ -111,23 +111,23 @@ export interface AgentConfig { /** Agent config for sandboxed execution (executor/judge). * - * Auth mode is auto-detected from the resolved `secret.value`: + * Both auth modes flow through microsandbox `Secret.env()` TLS substitution — + * the cleartext credential never enters the VM. Inside the sandbox the env + * var contains a `$MSB_` placeholder; microsandbox swaps it for the + * real value on outbound TLS to the allowed host only. * - * - API keys (anything not matching the OAuth prefix) are TLS-injected by - * microsandbox — cleartext never enters the VM; the env var inside the - * sandbox contains a placeholder substituted on the wire only for the - * agent's allowed host. - * - Claude Code subscription OAuth tokens (prefix `sk-ant-oat`, e.g. - * `sk-ant-oat01-…`, issued by - * `claude setup-token`) are injected as a plain `CLAUDE_CODE_OAUTH_TOKEN` - * env var. Claude Code reads the token directly from `process.env`, so the - * TLS-substitution model does not apply. Avoids per-token API billing on - * Pro / Max / Team / Enterprise plans. + * The resolved `secret.value`'s prefix picks which env var name carries the + * placeholder: + * + * - `sk-ant-oat…` (Claude Code subscription OAuth token, issued by + * `claude setup-token`, requires Pro / Max / Team / Enterprise) → + * `CLAUDE_CODE_OAUTH_TOKEN`. Avoids per-token API billing. + * - anything else (API keys for known agents, custom-agent secrets) → + * `secret.envVar` (= `ANTHROPIC_API_KEY` for claude, etc.). * * Point `secret.value` at the host env var that holds the credential — * `$ANTHROPIC_API_KEY` for the API-key path, `$CLAUDE_CODE_OAUTH_TOKEN` for - * the subscription path. The runtime sniffs the resolved value to pick the - * path. 
+ * the subscription path. */ export interface SandboxAgentConfig extends AgentConfig { /** Agent's secret and base URL. Auth mode is determined from the resolved value's prefix. */ From 13ccece01558ee4911a315a3c19c2db27175ee9e Mon Sep 17 00:00:00 2001 From: nickwinder Date: Fri, 15 May 2026 11:28:12 +1200 Subject: [PATCH 5/6] docs: scope OAuth-vs-API-key prefix detection to the claude adapter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The schema reference previously read like the `sk-ant-oat…` / `sk-ant-api…` prefix dispatch was a generic feature across all adapters. In reality it's a claude-only fork — codex, gemini, and custom agents only have the API-key path today. Reframe the SandboxAgentConfig description so the default behavior leads (TLS substitution into the adapter-default env var) and the OAuth slot is clearly tagged as the claude-specific opt-in. --- skills/_reference/config-schema.md | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/skills/_reference/config-schema.md b/skills/_reference/config-schema.md index 79af65b..276784f 100644 --- a/skills/_reference/config-schema.md +++ b/skills/_reference/config-schema.md @@ -82,14 +82,11 @@ Extends AgentConfig with one **required** field: |-------|------|----------| | `secret` | `AgentSecretConfig` | **Yes** | -Both auth modes flow through microsandbox `Secret.env()` TLS substitution — the cleartext credential never enters the VM. Inside the sandbox the env var contains a `$MSB_` placeholder; microsandbox swaps it for the real value on outbound TLS to the allowed host only. +The resolved `secret.value` is wired into the sandbox via microsandbox `Secret.env()` TLS substitution — the cleartext credential never enters the VM. Inside the sandbox the env var contains a `$MSB_` placeholder; microsandbox swaps it for the real value on outbound TLS to the allowed host only. 
-The resolved `secret.value`'s prefix picks which env var name carries the placeholder: +By default the placeholder lands under the adapter's API-key env var (e.g. `ANTHROPIC_API_KEY` for claude, see [Known Agent Defaults](#known-agent-defaults-auto-filled-when-field-is-absent) below). -- `sk-ant-api…` (Anthropic API key, e.g. `sk-ant-api03-…`) → `ANTHROPIC_API_KEY`. -- `sk-ant-oat…` (Claude Code subscription OAuth token, e.g. `sk-ant-oat01-…`, issued by `claude setup-token`) → `CLAUDE_CODE_OAUTH_TOKEN`. Avoids per-token API billing on Pro / Max / Team / Enterprise plans. - -Point `secret.value` at the host env var that holds the credential — `"$ANTHROPIC_API_KEY"` for the API-key path, `"$CLAUDE_CODE_OAUTH_TOKEN"` for the subscription path. +**Claude-only: subscription auth.** When `command: "claude"` and the resolved value starts with `sk-ant-oat` (a Claude Code subscription OAuth token issued by `claude setup-token`, e.g. `sk-ant-oat01-…`), the placeholder lands under `CLAUDE_CODE_OAUTH_TOKEN` instead. This lets you bill the run against a Pro / Max / Team / Enterprise plan instead of per-token API charges. Point `secret.value` at `"$CLAUDE_CODE_OAUTH_TOKEN"` to opt in. Other adapters (codex, gemini, custom) only have the API-key path today. ### AgentSecretConfig From 8575a876b14decf40e5aa76b056b147061e33c35 Mon Sep 17 00:00:00 2001 From: nickwinder Date: Fri, 15 May 2026 11:33:24 +1200 Subject: [PATCH 6/6] =?UTF-8?q?refactor(auth):=20simplify=20applyAgentAuth?= =?UTF-8?q?=20=E2=80=94=20reuse=20AgentAdapter,=20drop=20reflection=20in?= =?UTF-8?q?=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code review on the PR surfaced five cleanups, all in the diff and none changing runtime behaviour: - `applyAgentAuth` accepted a local `AgentAuthAdapter` interface that duplicated three fields of the exported `AgentAdapter`.
Switched to `Pick<AgentAdapter, 'baseUrlEnvVar' | 'additionalAllowHosts'>` so the shape stays tied to the real adapter type and unused fields (`defaultBaseUrl`) drop out. - Added a paired `OAUTH_TOKEN_ENV_VAR = 'CLAUDE_CODE_OAUTH_TOKEN'` constant next to `OAUTH_TOKEN_PREFIX` so the two Anthropic-specific strings live together; replaced the inline literal. - The `applyAgentAuth` tests previously inspected the opaque `SecretEntry` shape through a three-field reflection helper — fragile if the microsandbox SDK ever renames an internal field. Tests now assert on `Secret.env`'s call args (already mocked in this file), so we verify the wire contract rather than the library's internal representation. - Dropped dead `isOAuthSecret: vi.fn()` mock entries in `execute.test.ts` and `judge.test.ts` (the function was unexported in an earlier refactor commit; the mocks were leftovers). - Trimmed a verbose comment in `buildJudgeAllowlist` that restated what the surrounding `if` already expressed. 328 tests pass; type-check + lint clean. No runtime behaviour change, so no re-smoke needed.
--- src/commands/__tests__/execute.test.ts | 1 - src/sandbox/__tests__/microsandbox.test.ts | 43 ++++++++++------------ src/sandbox/microsandbox.ts | 24 +++++------- src/scoring/__tests__/judge.test.ts | 1 - src/scoring/judge.ts | 4 +- 5 files changed, 29 insertions(+), 44 deletions(-) diff --git a/src/commands/__tests__/execute.test.ts b/src/commands/__tests__/execute.test.ts index e600125..cd9a570 100644 --- a/src/commands/__tests__/execute.test.ts +++ b/src/commands/__tests__/execute.test.ts @@ -42,7 +42,6 @@ vi.mock('../../sandbox/microsandbox.js', () => { MicrosandboxClient: MockMicrosandboxClient, buildSecrets: vi.fn().mockReturnValue([]), applyAgentAuth: vi.fn(), - isOAuthSecret: vi.fn().mockReturnValue(false), resolveEnv: vi.fn().mockReturnValue({}), }; }); diff --git a/src/sandbox/__tests__/microsandbox.test.ts b/src/sandbox/__tests__/microsandbox.test.ts index 50346bf..fe823b4 100644 --- a/src/sandbox/__tests__/microsandbox.test.ts +++ b/src/sandbox/__tests__/microsandbox.test.ts @@ -1,5 +1,4 @@ import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; -import type { SecretEntry } from 'microsandbox'; import { MicrosandboxClient, buildSecrets, resolveEnv, applyAgentAuth } from '../microsandbox.js'; // ── Mocks ──────────────────────────────────────────────────────────────────── @@ -335,6 +334,10 @@ describe('applyAgentAuth', () => { const ORIGINAL_API_KEY = process.env.ANTHROPIC_API_KEY; const ORIGINAL_OAUTH = process.env.CLAUDE_CODE_OAUTH_TOKEN; + beforeEach(() => { + vi.clearAllMocks(); + }); + afterEach(() => { const restore = (key: string, value: string | undefined) => { if (value === undefined) delete process.env[key]; @@ -346,51 +349,43 @@ describe('applyAgentAuth', () => { const claudeAdapter = { baseUrlEnvVar: 'ANTHROPIC_BASE_URL', - defaultBaseUrl: 'https://api.anthropic.com', additionalAllowHosts: [], }; - // The microsandbox `SecretEntry` is opaque, but inspecting its keys gives us - // enough confidence that the right env var 
name is being TLS-substituted. - const secretEnvVarName = (entry: SecretEntry): string | undefined => - (entry as { envVar?: string; env_var?: string; name?: string }).envVar - ?? (entry as { env_var?: string }).env_var - ?? (entry as { name?: string }).name; - - it('routes an OAuth-prefixed value through Secret.env under CLAUDE_CODE_OAUTH_TOKEN', () => { + it('routes an OAuth-prefixed value through Secret.env under CLAUDE_CODE_OAUTH_TOKEN', async () => { + const { Secret } = await import('microsandbox'); process.env.CLAUDE_CODE_OAUTH_TOKEN = 'sk-ant-oat01-fake-test-token'; - const secrets: SecretEntry[] = []; - const env: Record = {}; applyAgentAuth({ envVar: 'CLAUDE_CODE_OAUTH_TOKEN', value: '$CLAUDE_CODE_OAUTH_TOKEN', baseUrl: 'https://api.anthropic.com', - }, claudeAdapter, secrets, env); - expect(secrets).toHaveLength(1); - expect(secretEnvVarName(secrets[0])).toBe('CLAUDE_CODE_OAUTH_TOKEN'); - expect(env.ANTHROPIC_BASE_URL).toBe('https://api.anthropic.com'); + }, claudeAdapter, [], {}); + expect(Secret.env).toHaveBeenCalledWith('CLAUDE_CODE_OAUTH_TOKEN', expect.objectContaining({ + value: 'sk-ant-oat01-fake-test-token', + allowHosts: ['api.anthropic.com'], + })); }); - it('routes an API-key value through Secret.env under the agent-specific env var', () => { + it('routes an API-key value through Secret.env under the agent-specific env var', async () => { + const { Secret } = await import('microsandbox'); process.env.ANTHROPIC_API_KEY = 'sk-ant-api03-fake-test-key'; - const secrets: SecretEntry[] = []; const env: Record = {}; applyAgentAuth({ envVar: 'ANTHROPIC_API_KEY', value: '$ANTHROPIC_API_KEY', baseUrl: 'https://api.anthropic.com', baseUrlEnvVar: 'ANTHROPIC_BASE_URL', - }, claudeAdapter, secrets, env); - expect(secrets).toHaveLength(1); - expect(secretEnvVarName(secrets[0])).toBe('ANTHROPIC_API_KEY'); + }, claudeAdapter, [], env); + expect(Secret.env).toHaveBeenCalledWith('ANTHROPIC_API_KEY', expect.objectContaining({ + value: 'sk-ant-api03-fake-test-key', 
+ allowHosts: ['api.anthropic.com'], + })); expect(env.ANTHROPIC_BASE_URL).toBe('https://api.anthropic.com'); }); it('throws when envVar or baseUrl is missing', () => { - const secrets: SecretEntry[] = []; - const env: Record = {}; expect(() => applyAgentAuth({ value: 'literal-value', - } as never, claudeAdapter, secrets, env)).toThrow(/envVar and baseUrl/); + } as never, claudeAdapter, [], {})).toThrow(/envVar and baseUrl/); }); }); \ No newline at end of file diff --git a/src/sandbox/microsandbox.ts b/src/sandbox/microsandbox.ts index f661894..b8e6434 100644 --- a/src/sandbox/microsandbox.ts +++ b/src/sandbox/microsandbox.ts @@ -5,6 +5,7 @@ import type { FsEntry, } from 'microsandbox'; import type { SandboxConfig, SecretConfig, AgentSecretConfig } from '../types.js'; +import type { AgentAdapter } from '../agents/adapter.js'; export interface CommandResult { stdout: string; @@ -49,20 +50,13 @@ export function resolveEnv( return resolved; } -/** - * Claude Code subscription OAuth tokens are prefixed `sk-ant-oat` followed by a - * version number (e.g. `sk-ant-oat01-…`), issued by `claude setup-token`. API - * keys use `sk-ant-api` (e.g. `sk-ant-api03-…`). The auth mode is determined - * by inspecting the resolved secret value at sandbox-create time — no separate - * config flag needed. - */ +// Claude-specific credential format. Subscription OAuth tokens are prefixed +// `sk-ant-oat` followed by a version (e.g. `sk-ant-oat01-…`), issued by +// `claude setup-token`. API keys use `sk-ant-api`. The framework picks the +// env-var slot the placeholder lands under by inspecting the resolved +// value's prefix — no separate config flag needed. const OAUTH_TOKEN_PREFIX = 'sk-ant-oat'; - -interface AgentAuthAdapter { - baseUrlEnvVar: string | null; - defaultBaseUrl: string | null; - additionalAllowHosts: string[]; -} +const OAUTH_TOKEN_ENV_VAR = 'CLAUDE_CODE_OAUTH_TOKEN'; /** * Wire an agent's secret into the sandbox `secrets` and `env`. 
@@ -83,7 +77,7 @@ interface AgentAuthAdapter { */ export function applyAgentAuth( secret: AgentSecretConfig, - adapter: AgentAuthAdapter, + adapter: Pick, secrets: SecretEntry[], env: Record, ): void { @@ -93,7 +87,7 @@ export function applyAgentAuth( const value = resolveValue(secret.value, secret.envVar); const envVar = value.startsWith(OAUTH_TOKEN_PREFIX) - ? 'CLAUDE_CODE_OAUTH_TOKEN' + ? OAUTH_TOKEN_ENV_VAR : secret.envVar; const hostname = new URL(secret.baseUrl).hostname; diff --git a/src/scoring/__tests__/judge.test.ts b/src/scoring/__tests__/judge.test.ts index ad13b8b..c8952c6 100644 --- a/src/scoring/__tests__/judge.test.ts +++ b/src/scoring/__tests__/judge.test.ts @@ -24,7 +24,6 @@ vi.mock('../../sandbox/microsandbox.js', () => ({ }), buildSecrets: vi.fn().mockReturnValue([]), applyAgentAuth: vi.fn(), - isOAuthSecret: vi.fn().mockReturnValue(false), resolveEnv: vi.fn().mockReturnValue({}), })); diff --git a/src/scoring/judge.ts b/src/scoring/judge.ts index 7650154..fe2c42d 100644 --- a/src/scoring/judge.ts +++ b/src/scoring/judge.ts @@ -135,9 +135,7 @@ const INFRA_ALLOWLIST = [ export function buildJudgeAllowlist(judgeConfig: SandboxAgentConfig, config: Config): string[] { const hosts = new Set(); - // 1. Agent API endpoint — secret.baseUrl is always populated by validation - // (filled from adapter defaults for known agents). Same source whether - // the secret resolves to an API key or an OAuth token. + // 1. Agent API endpoint from secret.baseUrl if (judgeConfig.secret.baseUrl) { try { hosts.add(new URL(judgeConfig.secret.baseUrl).hostname); } catch { /* skip malformed */ } }