Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions command-snapshot.json
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@
"alias": [],
"command": "agent:test:run-eval",
"flagAliases": [],
"flagChars": ["n", "o", "s", "w"],
"flagChars": ["n", "o", "s"],
"flags": [
"api-name",
"api-version",
Expand All @@ -238,8 +238,7 @@
"no-normalize",
"result-format",
"spec",
"target-org",
"wait"
"target-org"
],
"plugin": "@salesforce/plugin-agent"
},
Expand Down
4 changes: 0 additions & 4 deletions messages/agent.test.run-eval.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@ Path to test spec file (YAML or JSON). Supports reading from stdin when piping c

Agent DeveloperName (also called API name) to resolve agent_id and agent_version_id. Auto-inferred from the YAML spec's subjectName.

# flags.wait.summary

Number of minutes to wait for results.

# flags.result-format.summary

Format of the agent test results.
Expand Down
260 changes: 72 additions & 188 deletions src/commands/agent/test/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,22 @@

import { readFile } from 'node:fs/promises';
import { Flags, SfCommand, toHelpSection } from '@salesforce/sf-plugins-core';
import { EnvironmentVariable, Messages, Org, SfError } from '@salesforce/core';
import { type EvalPayload, normalizePayload, splitIntoBatches } from '../../../evalNormalizer.js';
import { type EvalApiResponse, formatResults, type ResultFormat } from '../../../evalFormatter.js';
import { EnvironmentVariable, Messages, SfError } from '@salesforce/core';
import {
type EvalPayload,
normalizePayload,
splitIntoBatches,
type EvalApiResponse,
formatResults,
type ResultFormat,
isYamlTestSpec,
parseTestSpec,
translateTestSpec,
resolveAgent,
executeBatches,
buildResultSummary,
} from '@salesforce/agents';
import { resultFormatFlag } from '../../../flags.js';
import { isYamlTestSpec, parseTestSpec, translateTestSpec } from '../../../yamlSpecTranslator.js';

Messages.importMessagesDirectoryFromMetaUrl(import.meta.url);
const messages = Messages.loadMessages('@salesforce/plugin-agent', 'agent.test.run-eval');
Expand All @@ -30,132 +41,35 @@ export type RunEvalResult = {
summary: { passed: number; failed: number; scored: number; errors: number };
};

// --- Standalone helper functions ---

type ApiHeaders = {
orgId: string;
userId: string;
instanceUrl: string;
};

async function getApiHeaders(org: Org): Promise<ApiHeaders> {
const conn = org.getConnection();
const userInfo = await conn.request<{ user_id: string }>(`${conn.instanceUrl}/services/oauth2/userinfo`);

return {
orgId: org.getOrgId(),
userId: userInfo.user_id,
instanceUrl: conn.instanceUrl,
};
}

async function callEvalApi(org: Org, payload: EvalPayload, headers: ApiHeaders): Promise<{ results?: unknown[] }> {
const conn = org.getConnection();

return conn.request<{ results?: unknown[] }>({
url: 'https://api.salesforce.com/einstein/evaluation/v1/tests',
method: 'POST',
headers: {
'Content-Type': 'application/json',
'x-sfdc-core-tenant-id': `core/prod/${headers.orgId}`,
'x-org-id': headers.orgId,
'x-sfdc-core-instance-url': headers.instanceUrl,
'x-sfdc-user-id': headers.userId,
'x-client-feature-id': 'AIPlatformEvaluation',
'x-sfdc-app-context': 'EinsteinGPT',
},
body: JSON.stringify(payload),
});
}

async function resolveAgent(org: Org, apiName: string): Promise<{ agentId: string; versionId: string }> {
const conn = org.getConnection();

// Escape single quotes to prevent SOQL injection
const escapedApiName = apiName.replace(/'/g, "\\'");

const botResult = await conn.query<{ Id: string }>(
`SELECT Id FROM BotDefinition WHERE DeveloperName = '${escapedApiName}'`
);
if (!botResult.records.length) {
throw messages.createError('error.agentNotFound', [apiName]);
async function resolveAndInjectAgent(
org: Parameters<typeof resolveAgent>[0],
agentApiName: string,
payload: EvalPayload
): Promise<void> {
let agentId: string;
let versionId: string;
try {
({ agentId, versionId } = await resolveAgent(org, agentApiName));
} catch (e) {
const wrapped = SfError.wrap(e);
throw new SfError(`Agent '${agentApiName}' not found.`, 'AgentNotFound', [], 2, wrapped);
}
const agentId = botResult.records[0].Id;

// Filter to published/active versions only
const versionResult = await conn.query<{ Id: string }>(
`SELECT Id FROM BotVersion WHERE BotDefinitionId = '${agentId}' ORDER BY VersionNumber DESC LIMIT 1`
);
if (!versionResult.records.length) {
throw messages.createError('error.agentVersionNotFound', [apiName]);
}
const versionId = versionResult.records[0].Id;

return { agentId, versionId };
}

async function executeBatches(
org: Org,
batches: Array<EvalPayload['tests']>,
log: (msg: string) => void
): Promise<unknown[]> {
// Pre-calculate headers once to avoid redundant API calls
const headers = await getApiHeaders(org);

// Execute all batches in parallel for better performance
if (batches.length > 1) {
log(messages.getMessage('info.batchProgress', [batches.length, batches.length, 'total']));
}

const batchPromises = batches.map(async (batch) => {
const batchPayload: EvalPayload = { tests: batch };
const resultObj = await callEvalApi(org, batchPayload, headers);
return resultObj.results ?? [];
});

const batchResults = await Promise.all(batchPromises);
return batchResults.flat();
}

function buildResultSummary(mergedResponse: EvalApiResponse): {
summary: RunEvalResult['summary'];
testSummaries: RunEvalResult['tests'];
} {
const summary = { passed: 0, failed: 0, scored: 0, errors: 0 };
const testSummaries: Array<{ id: string; status: string; evaluations: unknown[]; outputs: unknown[] }> = [];

for (const testResult of mergedResponse.results ?? []) {
const tr = testResult as Record<string, unknown>;
const testId = (tr.id as string) ?? 'unknown';
const evalResults = (tr.evaluation_results as Array<Record<string, unknown>>) ?? [];
const testErrors = (tr.errors as unknown[]) ?? [];

const passed = evalResults.filter((e) => e.is_pass === true).length;
const failed = evalResults.filter((e) => e.is_pass === false).length;
const scored = evalResults.filter((e) => e.score != null && e.is_pass == null).length;

summary.passed += passed;
summary.failed += failed;
summary.scored += scored;
summary.errors += testErrors.length;

testSummaries.push({
id: testId,
status: failed > 0 || testErrors.length > 0 ? 'failed' : 'passed',
evaluations: evalResults,
outputs: (tr.outputs as unknown[]) ?? [],
});
for (const test of payload.tests) {
for (const step of test.steps) {
if (step.type === 'agent.create_session') {
// eslint-disable-next-line camelcase
step.agent_id = agentId;
// eslint-disable-next-line camelcase
step.agent_version_id = versionId;
}
}
}

return { summary, testSummaries };
}

export default class AgentTestRunEval extends SfCommand<RunEvalResult> {
public static readonly summary = messages.getMessage('summary');
public static readonly description = messages.getMessage('description');
public static readonly examples = messages.getMessages('examples');
public static state = 'beta';
public static readonly hidden = true;

public static readonly envVariablesSection = toHelpSection(
'ENVIRONMENT VARIABLES',
Expand All @@ -182,11 +96,6 @@ export default class AgentTestRunEval extends SfCommand<RunEvalResult> {
char: 'n',
summary: messages.getMessage('flags.api-name.summary'),
}),
wait: Flags.integer({
char: 'w',
default: 10,
summary: messages.getMessage('flags.wait.summary'),
}),
'result-format': resultFormatFlag(),
'batch-size': Flags.integer({
default: 5,
Expand All @@ -204,41 +113,39 @@ export default class AgentTestRunEval extends SfCommand<RunEvalResult> {

// 1. Get spec content (from file or stdin via allowStdin)
let rawContent = flags.spec;
let isYaml = isYamlTestSpec(rawContent);

// If spec looks like it might be a file path (not parseable content), read the file
try {
// Try to detect if it's actual content vs a file path
// If it's a valid YAML/JSON, it's content; otherwise treat as file path
if (!isYamlTestSpec(rawContent)) {
JSON.parse(rawContent);
}
// If we got here, it's valid content
} catch {
// Not valid content, must be a file path - read it
if (!isYaml) {
try {
rawContent = await readFile(flags.spec, 'utf-8');
} catch (e) {
const wrapped = SfError.wrap(e);
throw new SfError(`Spec file not found: ${flags.spec}`, 'SpecFileNotFound', [], 2, wrapped);
JSON.parse(rawContent);
} catch {
try {
rawContent = await readFile(flags.spec, 'utf-8');
} catch (e) {
const wrapped = SfError.wrap(e);
throw new SfError(`Spec file not found: ${flags.spec}`, 'SpecFileNotFound', [], 2, wrapped);
}
isYaml = isYamlTestSpec(rawContent);
}
}

// 2. Detect format and parse
let payload: EvalPayload;
let payload!: EvalPayload;
let agentApiName = flags['api-name'];

if (isYamlTestSpec(rawContent)) {
// YAML TestSpec detected — translate to EvalPayload
const spec = parseTestSpec(rawContent);
payload = translateTestSpec(spec);
if (isYaml) {
try {
const spec = parseTestSpec(rawContent);
payload = translateTestSpec(spec);

// Auto-infer api-name from subjectName if not explicitly provided
if (!agentApiName) {
agentApiName = spec.subjectName;
this.log(messages.getMessage('info.yamlDetected', [spec.subjectName, spec.testCases.length.toString()]));
if (!agentApiName) {
agentApiName = spec.subjectName;
this.log(messages.getMessage('info.yamlDetected', [spec.subjectName, spec.testCases.length.toString()]));
}
} catch (e) {
throw messages.createError('error.invalidPayload', [(e as Error).message]);
}
} else {
// JSON EvalPayload (original behavior)
try {
payload = JSON.parse(rawContent) as EvalPayload;
} catch (e) {
Expand All @@ -250,47 +157,28 @@ export default class AgentTestRunEval extends SfCommand<RunEvalResult> {
throw messages.createError('error.invalidPayload', ['missing or empty "tests" array']);
}

// 3. If --api-name (or auto-inferred from YAML), resolve IDs and inject
if (agentApiName) {
let agentId;
let versionId;
try {
const resolved = await resolveAgent(org, agentApiName);
agentId = resolved.agentId;
versionId = resolved.versionId;
} catch (e) {
const wrapped = SfError.wrap(e);
throw new SfError(`Agent '${agentApiName}' not found.`, 'AgentNotFound', [], 2, wrapped);
for (const test of payload.tests) {
if (!Array.isArray(test.steps)) {
throw messages.createError('error.invalidPayload', [`test '${test.id}' has missing or invalid 'steps' array`]);
}
}

for (const test of payload.tests) {
for (const step of test.steps) {
if (step.type === 'agent.create_session') {
// eslint-disable-next-line camelcase
step.agent_id = agentId;
// eslint-disable-next-line camelcase
step.agent_version_id = versionId;
}
}
}
// 3. If --api-name (or auto-inferred from YAML), resolve IDs and inject
if (agentApiName) {
await resolveAndInjectAgent(org, agentApiName, payload);
}

// 4. Normalize payload unless --no-normalize
if (!flags['no-normalize']) {
payload = normalizePayload(payload);
}

// 5. Clamp batch size
// 5. Clamp batch size and split into batches
const batchSize = Math.min(Math.max(flags['batch-size'], 1), 5);

// 6. Split into batches
const batches = splitIntoBatches(payload.tests, batchSize);

// 7. Execute batches
let allResults;
try {
allResults = await executeBatches(org, batches, (msg) => this.log(msg));
} catch (e) {
// 6. Execute batches
const allResults = await executeBatches(org, batches, (msg) => this.log(msg)).catch((e) => {
const wrapped = SfError.wrap(e);
throw new SfError(
`Failed to execute tests: ${wrapped.message}`,
Expand All @@ -299,20 +187,16 @@ export default class AgentTestRunEval extends SfCommand<RunEvalResult> {
4,
wrapped
);
}
});

const mergedResponse: EvalApiResponse = { results: allResults as EvalApiResponse['results'] };

// 9. Format output
const resultFormat = (flags['result-format'] ?? 'human') as ResultFormat;
const formatted = formatResults(mergedResponse, resultFormat);
this.log(formatted);
// 7. Format output
this.log(formatResults(mergedResponse, (flags['result-format'] ?? 'human') as ResultFormat));

// 10. Build structured result for --json
// 8. Build structured result for --json
const { summary, testSummaries } = buildResultSummary(mergedResponse);

// Set exit code to 1 only for execution errors (tests couldn't run)
// Test failures (assertions failed) are business logic and should not affect exit code
if (summary.errors > 0) {
process.exitCode = 1;
}
Expand Down
Loading
Loading