Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions src/orchestrator/batch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ export interface CompareOptions {
judgeModel: string
answeringModel: string
sampling?: SamplingConfig
questionIds?: string[]
force?: boolean
}

Expand Down Expand Up @@ -146,7 +147,7 @@ export class BatchManager {
}

async createManifest(options: CompareOptions): Promise<CompareManifest> {
const { providers, benchmark, judgeModel, answeringModel, sampling } = options
const { providers, benchmark, judgeModel, answeringModel, sampling, questionIds } = options
const compareId = generateCompareId()

logger.info(`Loading benchmark: ${benchmark}`)
Expand All @@ -155,7 +156,37 @@ export class BatchManager {
const allQuestions = benchmarkInstance.getQuestions()

let targetQuestionIds: string[]
if (sampling) {
if (questionIds && questionIds.length > 0) {
// Resolve the caller-supplied questionIds against the loaded benchmark.
// IDs the benchmark does not contain are logged and dropped; the call only
// fails when not a single requested ID exists.
const allQuestionIdsSet = new Set(allQuestions.map((q) => q.questionId))
const validIds: string[] = []
const invalidIds: string[] = []

// De-duplicate first: a repeated ID would otherwise be copied into
// targetQuestionIds once per occurrence (and be repeated in the messages below).
// A Set iterates in insertion order, so the caller's ordering is preserved.
for (const id of Array.from(new Set(questionIds))) {
  if (allQuestionIdsSet.has(id)) {
    validIds.push(id)
  } else {
    invalidIds.push(id)
  }
}

if (invalidIds.length > 0) {
  logger.warn(`Invalid question IDs (will be skipped): ${invalidIds.join(", ")}`)
}

// Nothing usable was requested: abort rather than silently selecting no questions.
if (validIds.length === 0) {
  throw new Error(
    `All provided questionIds are invalid. No matching questions found in benchmark "${benchmark}". ` +
      `Invalid IDs: ${invalidIds.join(", ")}`
  )
}

targetQuestionIds = validIds
logger.info(
  `Using explicit questionIds: ${validIds.length} valid questions` +
    (invalidIds.length > 0 ? ` (${invalidIds.length} invalid skipped)` : "")
)
} else if (sampling) {
targetQuestionIds = selectQuestionsBySampling(allQuestions, sampling)
} else {
targetQuestionIds = allQuestions.map((q) => q.questionId)
Expand Down
31 changes: 29 additions & 2 deletions src/orchestrator/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,35 @@ export class Orchestrator {
effectiveLimit = limit

if (questionIds && questionIds.length > 0) {
logger.info(`Using explicit questionIds: ${questionIds.length} questions`)
targetQuestionIds = questionIds
// Check every explicitly requested question ID against the benchmark's questions.
// Unknown IDs produce a warning and are skipped; an error is raised only when
// none of the requested IDs exist.
const allQuestionIdsSet = new Set(allQuestions.map((q) => q.questionId))
const validIds: string[] = []
const invalidIds: string[] = []

// Collapse repeated IDs before classifying them so that the same question is
// not selected more than once. Set iteration keeps first-seen order, so the
// order in which the caller listed the IDs is retained.
for (const id of Array.from(new Set(questionIds))) {
  if (allQuestionIdsSet.has(id)) {
    validIds.push(id)
  } else {
    invalidIds.push(id)
  }
}

if (invalidIds.length > 0) {
  logger.warn(`Invalid question IDs (will be skipped): ${invalidIds.join(", ")}`)
}

// With no valid ID left there is nothing to run; fail loudly with the offending IDs.
if (validIds.length === 0) {
  throw new Error(
    `All provided questionIds are invalid. No matching questions found in benchmark "${benchmarkName}". ` +
      `Invalid IDs: ${invalidIds.join(", ")}`
  )
}

targetQuestionIds = validIds
logger.info(
  `Using explicit questionIds: ${validIds.length} valid questions` +
    (invalidIds.length > 0 ? ` (${invalidIds.length} invalid skipped)` : "")
)
} else if (sampling) {
logger.info(`Using sampling mode: ${sampling.mode}`)
targetQuestionIds = selectQuestionsBySampling(allQuestions, sampling)
Expand Down
69 changes: 69 additions & 0 deletions src/server/routes/benchmarks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,75 @@ export async function handleBenchmarksRoutes(req: Request, url: URL): Promise<Re
}
}

// POST /api/benchmarks/:name/expand-ids - Expand conversation/session patterns to question IDs
//
// Request body: { patterns: string[] }. Each pattern is trimmed (blank ones are
// ignored) and interpreted as:
//   1. a conversation ID (letters, "-", digits, e.g. "conv-26"): every question
//      whose questionId starts with "<pattern>-q"
//   2. a session ID (contains "-session"): every question whose
//      haystackSessionIds includes the pattern
//   3. otherwise, or when 1/2 matched nothing: a literal question ID, kept only
//      if a question with exactly that ID exists
// Response: { expandedIds, patternResults } where expandedIds is the
// de-duplicated union of all matches and patternResults maps each pattern, as
// sent by the client, to the IDs it produced.
const expandIdsMatch = pathname.match(/^\/api\/benchmarks\/([^/]+)\/expand-ids$/)
if (method === "POST" && expandIdsMatch) {
  const benchmarkName = expandIdsMatch[1]

  try {
    // Treat the parsed body as untrusted: it may be null, a primitive, or an
    // object whose `patterns` is not what we expect.
    const body: unknown = await req.json()
    const patterns =
      typeof body === "object" && body !== null
        ? (body as { patterns?: unknown }).patterns
        : undefined

    if (!Array.isArray(patterns)) {
      return json({ error: "patterns array is required" }, 400)
    }
    // Reject non-string entries up front instead of failing later inside
    // `.trim()` with an opaque "pattern.trim is not a function" message.
    if (!patterns.every((p): p is string => typeof p === "string")) {
      return json({ error: "patterns must be an array of strings" }, 400)
    }

    const benchmark = createBenchmark(benchmarkName as any)
    await benchmark.load()
    const allQuestions = benchmark.getQuestions()

    const expandedIds = new Set<string>()
    const patternResults: Record<string, string[]> = {}

    // Hoisted so the regex is not re-created for every pattern.
    const conversationIdPattern = /^[a-zA-Z]+-\d+$/

    for (const pattern of patterns) {
      const trimmed = pattern.trim()
      if (!trimmed) continue

      let expanded: string[] = []

      // Pattern 1: Conversation ID (e.g., "conv-26") - expand to all questions
      // Matches only when the pattern ends with a number and has no -q or -session suffix
      if (conversationIdPattern.test(trimmed)) {
        expanded = allQuestions
          .filter((q) => q.questionId.startsWith(trimmed + "-q"))
          .map((q) => q.questionId)
      }
      // Pattern 2: Session ID (e.g., "conv-26-session_1" or "001be529-session-0")
      // Find all questions that reference this session
      else if (trimmed.includes("-session")) {
        expanded = allQuestions
          .filter((q) => q.haystackSessionIds.includes(trimmed))
          .map((q) => q.questionId)
      }

      // Pattern 3: Direct question ID - add as-is if it exists.
      // Also serves as a fallback: a question ID that merely looks like a
      // conversation or session ID would otherwise never be matched exactly.
      if (expanded.length === 0 && allQuestions.some((q) => q.questionId === trimmed)) {
        expanded = [trimmed]
      }

      for (const id of expanded) {
        expandedIds.add(id)
      }
      // Keyed by the pattern exactly as received so the client can look up its own input.
      patternResults[pattern] = expanded
    }

    return json({
      expandedIds: Array.from(expandedIds),
      patternResults,
    })
  } catch (e) {
    // NOTE(review): every failure (malformed JSON, unknown benchmark, load error)
    // is reported as a 400 here — confirm whether load failures should be 5xx.
    return json({ error: e instanceof Error ? e.message : "Failed to expand IDs" }, 400)
  }
}

// GET /api/models - List available models
if (method === "GET" && pathname === "/api/models") {
const openai = listModelsByProvider("openai").map((alias) => ({
Expand Down
5 changes: 4 additions & 1 deletion src/server/routes/compare.ts
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,8 @@ export async function handleCompareRoutes(req: Request, url: URL): Promise<Respo
if (method === "POST" && pathname === "/api/compare/start") {
try {
const body = await req.json()
const { providers, benchmark, judgeModel, answeringModel, sampling, force } = body
const { providers, benchmark, judgeModel, answeringModel, sampling, questionIds, force } =
body

if (!providers || !Array.isArray(providers) || providers.length === 0) {
return json({ error: "Missing or invalid providers array" }, 400)
Expand All @@ -165,6 +166,7 @@ export async function handleCompareRoutes(req: Request, url: URL): Promise<Respo
judgeModel,
answeringModel,
sampling,
questionIds,
force,
})

Expand Down Expand Up @@ -387,6 +389,7 @@ async function initializeComparison(options: {
judgeModel: string
answeringModel: string
sampling?: SamplingConfig
questionIds?: string[]
force?: boolean
}): Promise<{ compareId: string }> {
// Only await manifest creation - this is fast
Expand Down
5 changes: 5 additions & 0 deletions src/server/routes/runs.ts
Original file line number Diff line number Diff line change
Expand Up @@ -190,12 +190,14 @@ export async function handleRunsRoutes(req: Request, url: URL): Promise<Response
answeringModel,
limit,
sampling,
questionIds,
concurrency,
force,
fromPhase,
sourceRunId,
} = body
console.log("[API] Extracted sampling:", sampling)
console.log("[API] Extracted questionIds:", questionIds)
console.log("[API] Extracted concurrency:", concurrency)

if (!provider || !benchmark || !runId || !judgeModel) {
Expand Down Expand Up @@ -279,6 +281,7 @@ export async function handleRunsRoutes(req: Request, url: URL): Promise<Response
answeringModel,
limit,
sampling,
questionIds,
concurrency,
force: sourceRunId ? false : force,
fromPhase: fromPhase as PhaseId | undefined,
Expand Down Expand Up @@ -374,6 +377,7 @@ async function runBenchmark(options: {
answeringModel?: string
limit?: number
sampling?: SamplingConfig
questionIds?: string[]
concurrency?: ConcurrencyConfig
force?: boolean
fromPhase?: PhaseId
Expand All @@ -396,6 +400,7 @@ async function runBenchmark(options: {
answeringModel: options.answeringModel,
limit: options.limit,
sampling: options.sampling,
questionIds: options.questionIds,
concurrency: options.concurrency,
force: options.force,
phases,
Expand Down
Loading