supermemoryai · sreedharsreeram · Feb 17, 2026
diff --git a/src/orchestrator/batch.ts b/src/orchestrator/batch.ts
@@ -35,6 +35,7 @@ export interface CompareOptions {
   judgeModel: string
   answeringModel: string
   sampling?: SamplingConfig
+  questionIds?: string[]
   force?: boolean
 }
 
@@ -146,7 +147,7 @@ export class BatchManager {
   }
 
   async createManifest(options: CompareOptions): Promise<CompareManifest> {
-    const { providers, benchmark, judgeModel, answeringModel, sampling } = options
+    const { providers, benchmark, judgeModel, answeringModel, sampling, questionIds } = options
     const compareId = generateCompareId()
 
     logger.info(`Loading benchmark: ${benchmark}`)
@@ -155,7 +156,34 @@ export class BatchManager {
     const allQuestions = benchmarkInstance.getQuestions()
 
     let targetQuestionIds: string[]
-    if (sampling) {
+    if (questionIds && questionIds.length > 0) {
+      // Explicit IDs take precedence over sampling. Split the request into IDs that
+      // exist in the benchmark and IDs that do not; unknown IDs are skipped with a
+      // warning rather than rejected. Duplicates are dropped (first occurrence wins)
+      // so the same question is never selected twice.
+      const allQuestionIdsSet = new Set(allQuestions.map((q) => q.questionId))
+      const uniqueIds = Array.from(new Set(questionIds))
+      const validIds = uniqueIds.filter((id) => allQuestionIdsSet.has(id))
+      const invalidIds = uniqueIds.filter((id) => !allQuestionIdsSet.has(id))
+
+      if (invalidIds.length > 0) {
+        logger.warn(`Invalid question IDs (will be skipped): ${invalidIds.join(", ")}`)
+      }
+
+      // Nothing left to run: fail loudly rather than continue with an empty selection.
+      if (validIds.length === 0) {
+        throw new Error(
+          `All provided questionIds are invalid. No matching questions found in benchmark "${benchmark}". ` +
+            `Invalid IDs: ${invalidIds.join(", ")}`
+        )
+      }
+
+      targetQuestionIds = validIds
+      logger.info(
+        `Using explicit questionIds: ${validIds.length} valid questions` +
+          (invalidIds.length > 0 ? ` (${invalidIds.length} invalid skipped)` : "")
+      )
+    } else if (sampling) {
       targetQuestionIds = selectQuestionsBySampling(allQuestions, sampling)
     } else {
       targetQuestionIds = allQuestions.map((q) => q.questionId)

diff --git a/src/orchestrator/index.ts b/src/orchestrator/index.ts
@@ -213,8 +213,31 @@ export class Orchestrator {
       effectiveLimit = limit
 
       if (questionIds && questionIds.length > 0) {
-        logger.info(`Using explicit questionIds: ${questionIds.length} questions`)
-        targetQuestionIds = questionIds
+        // Split the requested IDs into those that exist in the benchmark and those that
+        // do not; unknown IDs are skipped with a warning rather than rejected. Duplicates
+        // are dropped (first occurrence wins) so the same question is never selected twice.
+        const allQuestionIdsSet = new Set(allQuestions.map((q) => q.questionId))
+        const uniqueIds = Array.from(new Set(questionIds))
+        const validIds = uniqueIds.filter((id) => allQuestionIdsSet.has(id))
+        const invalidIds = uniqueIds.filter((id) => !allQuestionIdsSet.has(id))
+
+        if (invalidIds.length > 0) {
+          logger.warn(`Invalid question IDs (will be skipped): ${invalidIds.join(", ")}`)
+        }
+
+        // Nothing left to run: fail loudly rather than continue with an empty selection.
+        if (validIds.length === 0) {
+          throw new Error(
+            `All provided questionIds are invalid. No matching questions found in benchmark "${benchmarkName}". ` +
+              `Invalid IDs: ${invalidIds.join(", ")}`
+          )
+        }
+
+        targetQuestionIds = validIds
+        logger.info(
+          `Using explicit questionIds: ${validIds.length} valid questions` +
+            (invalidIds.length > 0 ? ` (${invalidIds.length} invalid skipped)` : "")
+        )
       } else if (sampling) {
         logger.info(`Using sampling mode: ${sampling.mode}`)
         targetQuestionIds = selectQuestionsBySampling(allQuestions, sampling)

diff --git a/src/server/routes/benchmarks.ts b/src/server/routes/benchmarks.ts
@@ -128,6 +128,78 @@ export async function handleBenchmarksRoutes(req: Request, url: URL): Promise<Re
     }
   }
 
+  // POST /api/benchmarks/:name/expand-ids - Expand conversation/session patterns to question IDs
+  // Body: { patterns: string[] }. Responds with `expandedIds` (the de-duplicated union of all
+  // matches) and `patternResults` (the matches for each pattern, keyed by the pattern as sent).
+  const expandIdsMatch = pathname.match(/^\/api\/benchmarks\/([^/]+)\/expand-ids$/)
+  if (method === "POST" && expandIdsMatch) {
+    const benchmarkName = expandIdsMatch[1]
+
+    try {
+      const body = await req.json()
+      const patterns: unknown = (body as { patterns?: unknown } | null)?.patterns
+
+      // The body is untrusted: insist on an array of strings up front so a malformed
+      // payload gets a clear 400 instead of a TypeError message leaked from `.trim()`.
+      if (!Array.isArray(patterns)) {
+        return json({ error: "patterns array is required" }, 400)
+      }
+      if (!patterns.every((p): p is string => typeof p === "string")) {
+        return json({ error: "patterns must be an array of strings" }, 400)
+      }
+
+      const benchmark = createBenchmark(benchmarkName as any)
+      await benchmark.load()
+      const allQuestions = benchmark.getQuestions()
+      // Built once so exact-ID lookups do not rescan the question list for every pattern.
+      const knownQuestionIds = new Set(allQuestions.map((q) => q.questionId))
+
+      const expandedIds = new Set<string>()
+      // A Map rather than a plain object: a caller-supplied key such as "__proto__" must be
+      // stored as data, not assigned through the prototype setter.
+      const patternResults = new Map<string, string[]>()
+
+      for (const pattern of patterns) {
+        const trimmed = pattern.trim()
+        if (!trimmed) continue
+
+        let expanded: string[] = []
+
+        // Pattern 1: Conversation ID (e.g., "conv-26") - expand to all of its questions.
+        // Shape: letters, a dash, then digits, with no -q or -session suffix.
+        if (/^[a-zA-Z]+-\d+$/.test(trimmed)) {
+          expanded = allQuestions
+            .filter((q) => q.questionId.startsWith(trimmed + "-q"))
+            .map((q) => q.questionId)
+        }
+        // Pattern 2: Session ID (e.g., "conv-26-session_1" or "001be529-session-0")
+        // Find all questions that reference this session
+        else if (trimmed.includes("-session")) {
+          expanded = allQuestions
+            .filter((q) => q.haystackSessionIds.includes(trimmed))
+            .map((q) => q.questionId)
+        }
+
+        // Pattern 3: Direct question ID. Also the fallback when a pattern only looks like a
+        // conversation or session ID but expanded to nothing, so question IDs of that
+        // shape are not silently dropped.
+        if (expanded.length === 0 && knownQuestionIds.has(trimmed)) {
+          expanded = [trimmed]
+        }
+
+        for (const id of expanded) expandedIds.add(id)
+        patternResults.set(pattern, expanded)
+      }
+
+      return json({
+        expandedIds: Array.from(expandedIds),
+        patternResults: Object.fromEntries(patternResults),
+      })
+    } catch (e) {
+      return json({ error: e instanceof Error ? e.message : "Failed to expand IDs" }, 400)
+    }
+  }
+
   // GET /api/models - List available models
   if (method === "GET" && pathname === "/api/models") {
     const openai = listModelsByProvider("openai").map((alias) => ({

diff --git a/src/server/routes/compare.ts b/src/server/routes/compare.ts
@@ -146,7 +146,8 @@ export async function handleCompareRoutes(req: Request, url: URL): Promise<Respo
   if (method === "POST" && pathname === "/api/compare/start") {
     try {
       const body = await req.json()
-      const { providers, benchmark, judgeModel, answeringModel, sampling, force } = body
+      const { providers, benchmark, judgeModel, answeringModel, sampling, questionIds, force } =
+        body
 
       if (!providers || !Array.isArray(providers) || providers.length === 0) {
         return json({ error: "Missing or invalid providers array" }, 400)
@@ -165,6 +166,7 @@ export async function handleCompareRoutes(req: Request, url: URL): Promise<Respo
         judgeModel,
         answeringModel,
         sampling,
+        questionIds,
         force,
       })
 
@@ -387,6 +389,7 @@ async function initializeComparison(options: {
   judgeModel: string
   answeringModel: string
   sampling?: SamplingConfig
+  questionIds?: string[]
   force?: boolean
 }): Promise<{ compareId: string }> {
   // Only await manifest creation - this is fast

diff --git a/src/server/routes/runs.ts b/src/server/routes/runs.ts
@@ -190,12 +190,14 @@ export async function handleRunsRoutes(req: Request, url: URL): Promise<Response
         answeringModel,
         limit,
         sampling,
+        questionIds,
         concurrency,
         force,
         fromPhase,
         sourceRunId,
       } = body
       console.log("[API] Extracted sampling:", sampling)
+      console.log("[API] Extracted questionIds:", questionIds)
       console.log("[API] Extracted concurrency:", concurrency)
 
       if (!provider || !benchmark || !runId || !judgeModel) {
@@ -279,6 +281,7 @@ export async function handleRunsRoutes(req: Request, url: URL): Promise<Response
         answeringModel,
         limit,
         sampling,
+        questionIds,
         concurrency,
         force: sourceRunId ? false : force,
         fromPhase: fromPhase as PhaseId | undefined,
@@ -374,6 +377,7 @@ async function runBenchmark(options: {
   answeringModel?: string
   limit?: number
   sampling?: SamplingConfig
+  questionIds?: string[]
   concurrency?: ConcurrencyConfig
   force?: boolean
   fromPhase?: PhaseId
@@ -396,6 +400,7 @@ async function runBenchmark(options: {
       answeringModel: options.answeringModel,
       limit: options.limit,
       sampling: options.sampling,
+      questionIds: options.questionIds,
       concurrency: options.concurrency,
       force: options.force,
       phases,