From 2346e429a06b8d14ffa82bfe6da624906f505522 Mon Sep 17 00:00:00 2001
From: Sreeram Sreedhar <ssreedh9@asu.edu>
Date: Fri, 22 May 2026 22:26:06 -0400
Subject: [PATCH 1/4] initial commit

---
 src/benchmarks/beam/index.ts | 273 +++++++++++++++++++++++++++++++++++
 src/benchmarks/beam/types.ts |  31 ++++
 src/benchmarks/index.ts      |   6 +-
 src/types/benchmark.ts       |   3 +-
 4 files changed, 311 insertions(+), 2 deletions(-)
 create mode 100644 src/benchmarks/beam/index.ts
 create mode 100644 src/benchmarks/beam/types.ts
diff --git a/src/benchmarks/beam/index.ts b/src/benchmarks/beam/index.ts
new file mode 100644
index 0000000..3af5a93
--- /dev/null
+++ b/src/benchmarks/beam/index.ts
@@ -0,0 +1,273 @@
+import { existsSync, readFileSync, readdirSync } from "fs"
+import { join } from "path"
+import type { Benchmark, BenchmarkConfig, QuestionFilter } from "../../types/benchmark"
+import type {
+  QuestionTypeRegistry,
+  UnifiedMessage,
+  UnifiedQuestion,
+  UnifiedSession,
+} from "../../types/unified"
+import { logger } from "../../utils/logger"
+import type { BeamBatch, BeamChatFile, BeamProbingQuestionsFile, BeamScale } from "./types"
+
+const DEFAULT_DATA_PATH = "./data/benchmarks/beam/chats"
+
+export const BEAM_QUESTION_TYPES: QuestionTypeRegistry = {
+  abstention: {
+    id: "abstention",
+    alias: "abstain",
+    description: "Withhold answers when evidence is missing",
+  },
+  contradiction_resolution: {
+    id: "contradiction_resolution",
+    alias: "contradict",
+    description: "Detect and reconcile inconsistent statements",
+  },
+  event_ordering: {
+    id: "event_ordering",
+    alias: "order",
+    description: "Reconstruct event or information order",
+  },
+  information_extraction: {
+    id: "information_extraction",
+    alias: "extract",
+    description: "Recall entities and factual details",
+  },
+  instruction_following: {
+    id: "instruction_following",
+    alias: "instruction",
+    description: "Follow sustained user instructions",
+  },
+  knowledge_update: {
+    id: "knowledge_update",
+    alias: "update",
+    description: "Retain updated facts over stale facts",
+  },
+  multi_session_reasoning: {
+    id: "multi_session_reasoning",
+    alias: "multi",
+    description: "Reason across non-adjacent dialogue segments",
+  },
+  preference_following: {
+    id: "preference_following",
+    alias: "preference",
+    description: "Adapt to evolving user preferences",
+  },
+  summarization: {
+    id: "summarization",
+    alias: "summary",
+    description: "Summarize dialogue content",
+  },
+  temporal_reasoning: {
+    id: "temporal_reasoning",
+    alias: "temporal",
+    description: "Reason about explicit and implicit time relations",
+  },
+}
+
+function flattenChatFile(chatFile: BeamChatFile): BeamBatch[] {
+  if (Array.isArray(chatFile)) {
+    return chatFile.flatMap((entry) => {
+      if ("turns" in entry) return [entry]
+      return flattenChatFile(entry)
+    })
+  }
+
+  return Object.keys(chatFile)
+    .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
+    .flatMap((key) => chatFile[key] || [])
+}
+
+function createGroundTruth(question: unknown): string {
+  return JSON.stringify(question)
+}
+
+function getQuestionAnswer(question: Record<string, unknown>): string | undefined {
+  const answer =
+    question.answer || question.ideal_answer || question.ideal_response || question.ideal_summary
+  return typeof answer === "string" ? answer : undefined
+}
+
+export class BeamBenchmark implements Benchmark {
+  name: string
+  private scales: BeamScale[]
+  private questions: UnifiedQuestion[] = []
+  private sessionsMap: Map<string, UnifiedSession[]> = new Map()
+  private ingestionGroupMap: Map<string, string> = new Map()
+  private dataPath: string = ""
+
+  constructor(scales: BeamScale[] = ["1M", "10M"], name = "beam") {
+    this.scales = scales
+    this.name = name
+  }
+
+  async load(config?: BenchmarkConfig): Promise<void> {
+    this.dataPath = config?.dataPath || DEFAULT_DATA_PATH
+    const fullPath = join(process.cwd(), this.dataPath)
+
+    if (!existsSync(fullPath)) {
+      throw new Error(
+        `BEAM dataset not found at ${fullPath}. Expected chats under ${DEFAULT_DATA_PATH}/{1M,10M}.`
+      )
+    }
+
+    for (const scale of this.scales) {
+      this.loadScale(fullPath, scale)
+    }
+
+    logger.info(
+      `Loaded ${this.questions.length} questions from BEAM (${this.scales.join(", ")})`
+    )
+  }
+
+  private loadScale(basePath: string, scale: BeamScale): void {
+    const scalePath = join(basePath, scale)
+    if (!existsSync(scalePath)) {
+      throw new Error(`BEAM ${scale} dataset not found at ${scalePath}`)
+    }
+
+    const chatDirs = readdirSync(scalePath, { withFileTypes: true })
+      .filter((entry) => entry.isDirectory() && /^\d+$/.test(entry.name))
+      .map((entry) => entry.name)
+      .sort((a, b) => Number(a) - Number(b))
+
+    for (const chatId of chatDirs) {
+      this.loadChat(scalePath, scale, chatId)
+    }
+  }
+
+  private loadChat(scalePath: string, scale: BeamScale, chatId: string): void {
+    const chatDir = join(scalePath, chatId)
+    const truncatedPath = join(chatDir, "chat_trunecated.json")
+    const fullChatPath = join(chatDir, "chat.json")
+    const chatPath = existsSync(truncatedPath) ? truncatedPath : fullChatPath
+    const probingPath = join(chatDir, "probing_questions", "probing_questions.json")
+
+    if (!existsSync(chatPath) || !existsSync(probingPath)) {
+      logger.warn(`Skipping BEAM ${scale}/${chatId}: missing chat or probing questions`)
+      return
+    }
+
+    const batches = flattenChatFile(JSON.parse(readFileSync(chatPath, "utf8")) as BeamChatFile)
+    const sessions = this.extractSessions(scale, chatId, batches)
+    const probingQuestions = JSON.parse(
+      readFileSync(probingPath, "utf8")
+    ) as BeamProbingQuestionsFile
+    const sessionIds = sessions.map((session) => session.sessionId)
+    const ingestionGroupId = `beam-${scale}-${chatId}`
+
+    for (const questionType of Object.keys(probingQuestions)) {
+      const questionsForType = probingQuestions[questionType] || []
+
+      for (let i = 0; i < questionsForType.length; i++) {
+        const probingQuestion = questionsForType[i]
+        const questionId = `${ingestionGroupId}-${questionType}-${i}`
+        const answer = getQuestionAnswer(probingQuestion)
+
+        this.questions.push({
+          questionId,
+          question: probingQuestion.question,
+          questionType,
+          groundTruth: createGroundTruth(probingQuestion),
+          haystackSessionIds: sessionIds,
+          metadata: {
+            scale,
+            chatId,
+            ingestionGroupId,
+            rubric: probingQuestion.rubric,
+            difficulty: probingQuestion.difficulty,
+            answer,
+          },
+        })
+
+        this.sessionsMap.set(questionId, sessions)
+        this.ingestionGroupMap.set(questionId, ingestionGroupId)
+      }
+    }
+  }
+
+  private extractSessions(scale: BeamScale, chatId: string, batches: BeamBatch[]): UnifiedSession[] {
+    const sessions: UnifiedSession[] = []
+
+    for (const batch of batches) {
+      for (let turnIndex = 0; turnIndex < batch.turns.length; turnIndex++) {
+        const turn = batch.turns[turnIndex]
+        const messages = turn
+          .filter((message) => message.content)
+          .map(
+            (message): UnifiedMessage => ({
+              role: message.role,
+              content: message.content,
+              speaker: message.role,
+              timestamp: message.time_anchor,
+            })
+          )
+
+        if (messages.length === 0) continue
+
+        sessions.push({
+          sessionId: `beam-${scale}-${chatId}-batch-${batch.batch_number}-turn-${turnIndex + 1}`,
+          messages,
+          metadata: {
+            scale,
+            chatId,
+            batchNumber: batch.batch_number,
+            turnIndex: turnIndex + 1,
+            timeAnchor: batch.time_anchor,
+          },
+        })
+      }
+    }
+
+    return sessions
+  }
+
+  getQuestions(filter?: QuestionFilter): UnifiedQuestion[] {
+    let result = [...this.questions]
+
+    if (filter?.questionTypes?.length) {
+      result = result.filter((q) => filter.questionTypes!.includes(q.questionType))
+    }
+
+    if (filter?.offset) {
+      result = result.slice(filter.offset)
+    }
+
+    if (filter?.limit) {
+      result = result.slice(0, filter.limit)
+    }
+
+    return result
+  }
+
+  getHaystackSessions(questionId: string): UnifiedSession[] {
+    return this.sessionsMap.get(questionId) || []
+  }
+
+  getGroundTruth(questionId: string): string {
+    const question = this.questions.find((q) => q.questionId === questionId)
+    return question?.groundTruth || ""
+  }
+
+  getQuestionTypes(): QuestionTypeRegistry {
+    return BEAM_QUESTION_TYPES
+  }
+
+  getIngestionGroupId(questionId: string): string {
+    return this.ingestionGroupMap.get(questionId) || questionId
+  }
+}
+
+export class Beam1MBenchmark extends BeamBenchmark {
+  constructor() {
+    super(["1M"], "beam-1m")
+  }
+}
+
+export class Beam10MBenchmark extends BeamBenchmark {
+  constructor() {
+    super(["10M"], "beam-10m")
+  }
+}
+
+export default BeamBenchmark
diff --git a/src/benchmarks/beam/types.ts b/src/benchmarks/beam/types.ts
new file mode 100644
index 0000000..522cbb0
--- /dev/null
+++ b/src/benchmarks/beam/types.ts
@@ -0,0 +1,31 @@
+export type BeamScale = "1M" | "10M"
+
+export interface BeamMessage {
+  role: "user" | "assistant"
+  id?: number
+  content: string
+  time_anchor?: string
+  index?: string
+  question_type?: string
+}
+
+export interface BeamBatch {
+  batch_number: number
+  time_anchor?: string | null
+  turns: BeamMessage[][]
+}
+
+export type BeamChatFile = BeamBatch[] | Record<string, BeamBatch[]> | Record<string, BeamBatch[]>[]
+
+export interface BeamProbingQuestion {
+  question: string
+  rubric: string[]
+  difficulty?: string
+  answer?: string
+  ideal_answer?: string
+  ideal_response?: string
+  ideal_summary?: string
+  [key: string]: unknown
+}
+
+export type BeamProbingQuestionsFile = Record<string, BeamProbingQuestion[]>
diff --git a/src/benchmarks/index.ts b/src/benchmarks/index.ts
index b790e87..56cd0c4 100644
--- a/src/benchmarks/index.ts
+++ b/src/benchmarks/index.ts
@@ -2,11 +2,15 @@ import type { Benchmark, BenchmarkName } from "../types/benchmark"
 import { LoCoMoBenchmark } from "./locomo"
 import { LongMemEvalBenchmark } from "./longmemeval"
 import { ConvoMemBenchmark } from "./convomem"
+import { Beam1MBenchmark, Beam10MBenchmark, BeamBenchmark } from "./beam"
 
 const benchmarks: Record<BenchmarkName, new () => Benchmark> = {
   locomo: LoCoMoBenchmark,
   longmemeval: LongMemEvalBenchmark,
   convomem: ConvoMemBenchmark,
+  "beam-1m": Beam1MBenchmark,
+  "beam-10m": Beam10MBenchmark,
+  beam: BeamBenchmark,
 }
 
 export function createBenchmark(name: BenchmarkName): Benchmark {
@@ -21,4 +25,4 @@ export function getAvailableBenchmarks(): BenchmarkName[] {
   return Object.keys(benchmarks) as BenchmarkName[]
 }
 
-export { LoCoMoBenchmark, LongMemEvalBenchmark, ConvoMemBenchmark }
+export { LoCoMoBenchmark, LongMemEvalBenchmark, ConvoMemBenchmark, BeamBenchmark }
diff --git a/src/types/benchmark.ts b/src/types/benchmark.ts
index 07961e4..7066f7b 100644
--- a/src/types/benchmark.ts
+++ b/src/types/benchmark.ts
@@ -18,6 +18,7 @@ export interface Benchmark {
   getHaystackSessions(questionId: string): UnifiedSession[]
   getGroundTruth(questionId: string): string
   getQuestionTypes(): QuestionTypeRegistry
+  getIngestionGroupId?(questionId: string): string
 }
 
-export type BenchmarkName = "locomo" | "longmemeval" | "convomem"
+export type BenchmarkName = "locomo" | "longmemeval" | "convomem" | "beam-1m" | "beam-10m" | "beam"

From 5955c9b47399ec7354ccc317dc0d197132060b39 Mon Sep 17 00:00:00 2001
From: Sreeram Sreedhar <ssreedh9@asu.edu>
Date: Fri, 22 May 2026 22:37:31 -0400
Subject: [PATCH 2/4] add beam benchmark integration

---
 src/benchmarks/beam/index.ts        | 18 +++++-
 src/orchestrator/index.ts           |  3 +-
 src/orchestrator/phases/evaluate.ts | 83 ++++++++++++++++++++++--
 src/orchestrator/phases/indexing.ts | 99 ++++++++++++++++++-----------
 src/orchestrator/phases/ingest.ts   | 81 +++++++++++++++--------
 src/orchestrator/phases/report.ts   |  1 +
 src/orchestrator/phases/search.ts   |  2 +-
 src/prompts/beam.ts                 | 89 ++++++++++++++++++++++++++
 src/types/checkpoint.ts             |  1 +
 src/types/judge.ts                  |  1 +
 src/types/unified.ts                |  1 +
 11 files changed, 308 insertions(+), 71 deletions(-)
 create mode 100644 src/prompts/beam.ts

diff --git a/src/benchmarks/beam/index.ts b/src/benchmarks/beam/index.ts
index 3af5a93..1d70583 100644
--- a/src/benchmarks/beam/index.ts
+++ b/src/benchmarks/beam/index.ts
@@ -68,7 +68,7 @@ export const BEAM_QUESTION_TYPES: QuestionTypeRegistry = {
 function flattenChatFile(chatFile: BeamChatFile): BeamBatch[] {
   if (Array.isArray(chatFile)) {
     return chatFile.flatMap((entry) => {
-      if ("turns" in entry) return [entry]
+      if (isBeamBatch(entry)) return [entry]
       return flattenChatFile(entry)
     })
   }
@@ -78,7 +78,23 @@ function flattenChatFile(chatFile: BeamChatFile): BeamBatch[] {
     .flatMap((key) => chatFile[key] || [])
 }
 
+function isBeamBatch(value: unknown): value is BeamBatch {
+  return (
+    typeof value === "object" &&
+    value !== null &&
+    "batch_number" in value &&
+    "turns" in value &&
+    Array.isArray((value as BeamBatch).turns)
+  )
+}
+
 function createGroundTruth(question: unknown): string {
+  if (typeof question === "object" && question !== null) {
+    const record = question as Record<string, unknown>
+    const answer = getQuestionAnswer(record)
+    return answer || JSON.stringify(question)
+  }
+
   return JSON.stringify(question)
 }
 
diff --git a/src/orchestrator/index.ts b/src/orchestrator/index.ts
index 64578bb..4d2b2b0 100644
--- a/src/orchestrator/index.ts
+++ b/src/orchestrator/index.ts
@@ -237,7 +237,8 @@ export class Orchestrator {
         : allQuestions
 
       for (const q of questionsToInit) {
-        const containerTag = `${q.questionId}-${checkpoint.dataSourceRunId}`
+        const ingestionGroupId = benchmark.getIngestionGroupId?.(q.questionId) || q.questionId
+        const containerTag = `${ingestionGroupId}-${checkpoint.dataSourceRunId}`
         this.checkpointManager.initQuestion(checkpoint, q.questionId, containerTag, {
           question: q.question,
           groundTruth: q.groundTruth,
diff --git a/src/orchestrator/phases/evaluate.ts b/src/orchestrator/phases/evaluate.ts
index a36205f..26c990d 100644
--- a/src/orchestrator/phases/evaluate.ts
+++ b/src/orchestrator/phases/evaluate.ts
@@ -2,11 +2,76 @@ import type { Judge } from "../../types/judge"
 import type { Benchmark } from "../../types/benchmark"
 import type { RunCheckpoint } from "../../types/checkpoint"
 import type { Provider } from "../../types/provider"
+import { generateText } from "ai"
 import { CheckpointManager } from "../checkpoint"
 import { logger } from "../../utils/logger"
 import { ConcurrentExecutor } from "../concurrent"
 import { resolveConcurrency } from "../../types/concurrency"
 import { calculateRetrievalMetrics } from "./retrieval-eval"
+import { buildBeamRubricJudgePrompt, parseBeamRubricJudgeResponse } from "../../prompts/beam"
+
+interface BeamRubricItemResult {
+  rubricItem: string
+  score: number
+  reason: string
+}
+
+function getBeamRubric(question: { metadata?: Record<string, unknown> }): string[] | null {
+  const rubric = question.metadata?.rubric
+  if (!Array.isArray(rubric) || rubric.some((item) => typeof item !== "string")) {
+    return null
+  }
+
+  return rubric
+}
+
+async function evaluateBeamRubricQuestion(
+  judge: Judge,
+  question: { question: string; metadata?: Record<string, unknown> },
+  hypothesis: string
+): Promise<{ score: number; label: "correct" | "incorrect"; explanation: string; details: Record<string, unknown> }> {
+  const rubric = getBeamRubric(question)
+  if (!rubric) {
+    return {
+      score: 0,
+      label: "incorrect",
+      explanation: "Missing BEAM rubric metadata",
+      details: {},
+    }
+  }
+
+  const model = judge.getModel()
+  const results: BeamRubricItemResult[] = []
+
+  for (const rubricItem of rubric) {
+    const prompt = buildBeamRubricJudgePrompt(question.question, rubricItem, hypothesis)
+    const { text } = await generateText({
+      model,
+      prompt,
+      maxOutputTokens: 512,
+      temperature: 0,
+    })
+    const parsed = parseBeamRubricJudgeResponse(text)
+    results.push({
+      rubricItem,
+      score: parsed.score,
+      reason: parsed.reason,
+    })
+  }
+
+  const averageScore =
+    results.length > 0 ? results.reduce((sum, item) => sum + item.score, 0) / results.length : 0
+
+  return {
+    score: averageScore,
+    label: averageScore >= 1 ? "correct" : "incorrect",
+    explanation: `BEAM rubric average score: ${averageScore.toFixed(2)}`,
+    details: {
+      rubricResults: results,
+      rubricAverageScore: averageScore,
+    },
+  }
+}
 
 export async function runEvaluatePhase(
   judge: Judge,
@@ -55,15 +120,18 @@ export async function runEvaluatePhase(
 
       try {
         const searchResults = checkpoint.questions[question.questionId].phases.search.results || []
+        const rubric = getBeamRubric(question)
 
         const [result, retrievalMetrics] = await Promise.all([
-          judge.evaluate({
-            question: question.question,
-            questionType: question.questionType,
-            groundTruth: question.groundTruth,
-            hypothesis,
-            providerPrompts: provider?.prompts,
-          }),
+          rubric
+            ? evaluateBeamRubricQuestion(judge, question, hypothesis)
+            : judge.evaluate({
+                question: question.question,
+                questionType: question.questionType,
+                groundTruth: question.groundTruth,
+                hypothesis,
+                providerPrompts: provider?.prompts,
+              }),
           calculateRetrievalMetrics(
             judge.getModel(),
             question.question,
@@ -78,6 +146,7 @@ export async function runEvaluatePhase(
           score: result.score,
           label: result.label,
           explanation: result.explanation,
+          details: result.details,
           retrievalMetrics,
           completedAt: new Date().toISOString(),
           durationMs,
diff --git a/src/orchestrator/phases/indexing.ts b/src/orchestrator/phases/indexing.ts
index 0304289..bd822fc 100644
--- a/src/orchestrator/phases/indexing.ts
+++ b/src/orchestrator/phases/indexing.ts
@@ -107,43 +107,64 @@ export async function runIndexingPhase(
 
   const concurrency = resolveConcurrency("indexing", checkpoint.concurrency, provider.concurrency)
 
-  const tracker = new IndexingProgressTracker(toIndex)
+  const questionsByContainer = new Map<string, typeof toIndex>()
+  for (const question of toIndex) {
+    const questions = questionsByContainer.get(question.containerTag) || []
+    questions.push(question)
+    questionsByContainer.set(question.containerTag, questions)
+  }
+
+  const indexingGroups = Array.from(questionsByContainer.entries()).map(
+    ([containerTag, questions]) => ({
+      containerTag,
+      questions,
+      representative: questions[0],
+    })
+  )
+
+  const tracker = new IndexingProgressTracker(indexingGroups.map((group) => group.representative))
   const totalEpisodes = tracker.getTotalEpisodes()
 
   logger.info(
-    `Awaiting indexing for ${toIndex.length} questions, ${totalEpisodes} episodes (concurrency: ${concurrency})...`
+    `Awaiting indexing for ${toIndex.length} questions across ${indexingGroups.length} containers, ${totalEpisodes} episodes (concurrency: ${concurrency})...`
   )
 
   tracker.display()
 
   await ConcurrentExecutor.execute(
-    toIndex,
+    indexingGroups,
     concurrency,
     checkpoint.runId,
     "indexing",
-    async ({ item: question }) => {
+    async ({ item: group }) => {
+      const question = group.representative
+      const groupQuestionIds = group.questions.map((q) => q.questionId)
       const ingestResult = question.phases.ingest.ingestResult
       const episodeCount = getEpisodeCount(question)
 
       if (!ingestResult || episodeCount === 0) {
-        checkpointManager.updatePhase(checkpoint, question.questionId, "indexing", {
-          status: "completed",
-          completedIds: [],
-          failedIds: [],
-          completedAt: new Date().toISOString(),
-          durationMs: 0,
-        })
+        for (const questionId of groupQuestionIds) {
+          checkpointManager.updatePhase(checkpoint, questionId, "indexing", {
+            status: "completed",
+            completedIds: [],
+            failedIds: [],
+            completedAt: new Date().toISOString(),
+            durationMs: 0,
+          })
+        }
         tracker.markQuestionDone(question.questionId)
         return { questionId: question.questionId, durationMs: 0 }
       }
 
       const startTime = Date.now()
-      checkpointManager.updatePhase(checkpoint, question.questionId, "indexing", {
-        status: "in_progress",
-        completedIds: [],
-        failedIds: [],
-        startedAt: new Date().toISOString(),
-      })
+      for (const questionId of groupQuestionIds) {
+        checkpointManager.updatePhase(checkpoint, questionId, "indexing", {
+          status: "in_progress",
+          completedIds: [],
+          failedIds: [],
+          startedAt: new Date().toISOString(),
+        })
+      }
 
       try {
         let lastProgress: IndexingProgress = {
@@ -152,36 +173,42 @@ export async function runIndexingPhase(
           total: episodeCount,
         }
 
-        await provider.awaitIndexing(ingestResult, question.containerTag, (progress) => {
+        await provider.awaitIndexing(ingestResult, group.containerTag, (progress) => {
           lastProgress = progress
           tracker.update(question.questionId, progress)
 
-          checkpointManager.updatePhase(checkpoint, question.questionId, "indexing", {
-            status: "in_progress",
-            completedIds: progress.completedIds,
-            failedIds: progress.failedIds,
-          })
+          for (const questionId of groupQuestionIds) {
+            checkpointManager.updatePhase(checkpoint, questionId, "indexing", {
+              status: "in_progress",
+              completedIds: progress.completedIds,
+              failedIds: progress.failedIds,
+            })
+          }
         })
 
         const durationMs = Date.now() - startTime
-        checkpointManager.updatePhase(checkpoint, question.questionId, "indexing", {
-          status: "completed",
-          completedIds: lastProgress.completedIds,
-          failedIds: lastProgress.failedIds,
-          completedAt: new Date().toISOString(),
-          durationMs,
-        })
+        for (const questionId of groupQuestionIds) {
+          checkpointManager.updatePhase(checkpoint, questionId, "indexing", {
+            status: "completed",
+            completedIds: lastProgress.completedIds,
+            failedIds: lastProgress.failedIds,
+            completedAt: new Date().toISOString(),
+            durationMs,
+          })
+        }
 
         return { questionId: question.questionId, durationMs }
       } catch (e) {
         const error = e instanceof Error ? e.message : String(e)
-        checkpointManager.updatePhase(checkpoint, question.questionId, "indexing", {
-          status: "failed",
-          error,
-        })
-        logger.error(`\nFailed to index ${question.questionId}: ${error}`)
+        for (const questionId of groupQuestionIds) {
+          checkpointManager.updatePhase(checkpoint, questionId, "indexing", {
+            status: "failed",
+            error,
+          })
+        }
+        logger.error(`\nFailed to index ${group.containerTag}: ${error}`)
         throw new Error(
-          `Indexing failed at ${question.questionId}: ${error}. Fix the issue and resume with the same run ID.`
+          `Indexing failed at ${group.containerTag}: ${error}. Fix the issue and resume with the same run ID.`
         )
       }
     }
diff --git a/src/orchestrator/phases/ingest.ts b/src/orchestrator/phases/ingest.ts
index e38815d..2f93f70 100644
--- a/src/orchestrator/phases/ingest.ts
+++ b/src/orchestrator/phases/ingest.ts
@@ -32,16 +32,34 @@ export async function runIngestPhase(
 
   const concurrency = resolveConcurrency("ingest", checkpoint.concurrency, provider.concurrency)
 
-  logger.info(`Ingesting ${pendingQuestions.length} questions (concurrency: ${concurrency})...`)
+  const questionsByGroup = new Map<string, typeof pendingQuestions>()
+  for (const question of pendingQuestions) {
+    const groupId = benchmark.getIngestionGroupId?.(question.questionId) || question.questionId
+    const questions = questionsByGroup.get(groupId) || []
+    questions.push(question)
+    questionsByGroup.set(groupId, questions)
+  }
+
+  const ingestionGroups = Array.from(questionsByGroup.entries()).map(([groupId, questions]) => ({
+    groupId,
+    questions,
+    representative: questions[0],
+  }))
+
+  logger.info(
+    `Ingesting ${pendingQuestions.length} questions across ${ingestionGroups.length} containers (concurrency: ${concurrency})...`
+  )
 
   await ConcurrentExecutor.executeBatched({
-    items: pendingQuestions,
+    items: ingestionGroups,
     concurrency,
     rateLimitMs: RATE_LIMIT_MS,
     runId: checkpoint.runId,
     phaseName: "ingest",
-    executeTask: async ({ item: question, index, total }) => {
-      const containerTag = `${question.questionId}-${checkpoint.dataSourceRunId}`
+    executeTask: async ({ item: group, index, total }) => {
+      const question = group.representative
+      const groupQuestionIds = group.questions.map((q) => q.questionId)
+      const containerTag = checkpoint.questions[question.questionId].containerTag
       const sessions = benchmark.getHaystackSessions(question.questionId)
 
       const sessionsMetadata = sessions.map((s) => ({
@@ -49,13 +67,17 @@ export async function runIngestPhase(
         date: s.metadata?.date as string | undefined,
         messageCount: s.messages.length,
       }))
-      checkpointManager.updateSessions(checkpoint, question.questionId, sessionsMetadata)
+      for (const questionId of groupQuestionIds) {
+        checkpointManager.updateSessions(checkpoint, questionId, sessionsMetadata)
+      }
 
       const startTime = Date.now()
-      checkpointManager.updatePhase(checkpoint, question.questionId, "ingest", {
-        status: "in_progress",
-        startedAt: new Date().toISOString(),
-      })
+      for (const questionId of groupQuestionIds) {
+        checkpointManager.updatePhase(checkpoint, questionId, "ingest", {
+          status: "in_progress",
+          startedAt: new Date().toISOString(),
+        })
+      }
 
       try {
         const completedSessions =
@@ -75,9 +97,11 @@ export async function runIngestPhase(
           }
 
           completedSessions.push(session.sessionId)
-          checkpointManager.updatePhase(checkpoint, question.questionId, "ingest", {
-            completedSessions,
-          })
+          for (const questionId of groupQuestionIds) {
+            checkpointManager.updatePhase(checkpoint, questionId, "ingest", {
+              completedSessions: [...completedSessions],
+            })
+          }
         }
 
         if (combinedResult.taskIds && combinedResult.taskIds.length === 0) {
@@ -99,25 +123,32 @@ export async function runIngestPhase(
         }
 
         const durationMs = Date.now() - startTime
-        checkpointManager.updatePhase(checkpoint, question.questionId, "ingest", {
-          status: "completed",
-          ingestResult: combinedResult,
-          completedAt: new Date().toISOString(),
-          durationMs,
-        })
+        for (const questionId of groupQuestionIds) {
+          checkpointManager.updatePhase(checkpoint, questionId, "ingest", {
+            status: "completed",
+            ingestResult: {
+              documentIds: [...combinedResult.documentIds],
+              ...(combinedResult.taskIds ? { taskIds: [...combinedResult.taskIds] } : {}),
+            },
+            completedAt: new Date().toISOString(),
+            durationMs,
+          })
+        }
 
-        logger.progress(index + 1, total, `Ingested ${question.questionId} (${durationMs}ms)`)
+        logger.progress(index + 1, total, `Ingested ${group.groupId} (${durationMs}ms)`)
 
         return { questionId: question.questionId, durationMs }
       } catch (e) {
         const error = e instanceof Error ? e.message : String(e)
-        checkpointManager.updatePhase(checkpoint, question.questionId, "ingest", {
-          status: "failed",
-          error,
-        })
-        logger.error(`Failed to ingest ${question.questionId}: ${error}`)
+        for (const questionId of groupQuestionIds) {
+          checkpointManager.updatePhase(checkpoint, questionId, "ingest", {
+            status: "failed",
+            error,
+          })
+        }
+        logger.error(`Failed to ingest ${group.groupId}: ${error}`)
         throw new Error(
-          `Ingest failed at ${question.questionId}: ${error}. Fix the issue and resume with the same run ID.`
+          `Ingest failed at ${group.groupId}: ${error}. Fix the issue and resume with the same run ID.`
         )
       }
     },
diff --git a/src/orchestrator/phases/report.ts b/src/orchestrator/phases/report.ts
index 4ec9aab..43b2102 100644
--- a/src/orchestrator/phases/report.ts
+++ b/src/orchestrator/phases/report.ts
@@ -133,6 +133,7 @@ export function generateReport(benchmark: Benchmark, checkpoint: RunCheckpoint):
       answerDurationMs,
       totalDurationMs,
       retrievalMetrics,
+      details: evalPhase.details,
     })
 
     if (retrievalMetrics) {
diff --git a/src/orchestrator/phases/search.ts b/src/orchestrator/phases/search.ts
index 65e4ac7..c774ab8 100644
--- a/src/orchestrator/phases/search.ts
+++ b/src/orchestrator/phases/search.ts
@@ -46,7 +46,7 @@ export async function runSearchPhase(
     checkpoint.runId,
     "search",
     async ({ item: question, index, total }) => {
-      const containerTag = `${question.questionId}-${checkpoint.dataSourceRunId}`
+      const containerTag = checkpoint.questions[question.questionId].containerTag
 
       const startTime = Date.now()
       checkpointManager.updatePhase(checkpoint, question.questionId, "search", {
diff --git a/src/prompts/beam.ts b/src/prompts/beam.ts
new file mode 100644
index 0000000..625252a
--- /dev/null
+++ b/src/prompts/beam.ts
@@ -0,0 +1,89 @@
+export interface BeamRubricJudgeResult {
+  score: number
+  reason: string
+}
+
+function parseJsonResponse(response: string): Record<string, unknown> {
+  const trimmed = response.trim()
+
+  if (trimmed.startsWith("```")) {
+    const codeFenceMatch = trimmed.match(/```(?:json)?\s*([\s\S]*?)\s*```/)
+    if (codeFenceMatch?.[1]) {
+      return JSON.parse(codeFenceMatch[1])
+    }
+  }
+
+  const jsonMatch = trimmed.match(/\{[\s\S]*\}/)
+  if (jsonMatch) {
+    return JSON.parse(jsonMatch[0])
+  }
+
+  return JSON.parse(trimmed)
+}
+
+export function buildBeamRubricJudgePrompt(
+  question: string,
+  rubricItem: string,
+  llmResponse: string
+): string {
+  return `You are an expert evaluator tasked with judging whether the LLM's response demonstrates compliance with the specified RUBRIC CRITERION.
+
+## EVALUATION INPUTS
+- QUESTION (what the user asked): ${question}
+- RUBRIC CRITERION (what to check): ${rubricItem}
+- RESPONSE TO EVALUATE: ${llmResponse}
+
+## EVALUATION RUBRIC:
+The rubric defines a specific requirement, constraint, or expected behavior that the LLM response should demonstrate.
+
+**IMPORTANT**: Pay careful attention to whether the rubric specifies:
+- **Positive requirements** (things the response SHOULD include/do)
+- **Negative constraints** (things the response SHOULD NOT include/do, often indicated by "no", "not", "avoid", "absent")
+
+## RESPONSIVENESS REQUIREMENT (anchored to the QUESTION)
+A compliant response must be **on-topic with respect to the QUESTION** and attempt to answer it.
+- If the response does not address the QUESTION, score **0.0** and stop.
+- For negative constraints, both must hold: (a) the response is responsive to the QUESTION, and (b) the prohibited element is absent.
+
+## SEMANTIC TOLERANCE RULES:
+Judge by meaning, not exact wording.
+- Accept paraphrases and synonyms that preserve intent.
+- Case/punctuation/whitespace differences must be ignored.
+- Numbers/currencies/dates may appear in equivalent forms when numerically equivalent.
+- If the rubric expects a number or duration, prefer normalized comparison over string matching.
+
+## STYLE NEUTRALITY:
+Ignore tone, politeness, length, and flourish unless the rubric explicitly requires a format/structure.
+- Do not penalize hedging, voice, or verbosity if content satisfies the rubric.
+- Only evaluate format when the rubric explicitly mandates it.
+
+## SCORING SCALE:
+- 1.0 (Complete Compliance): Fully complies with the rubric criterion.
+- 0.5 (Partial Compliance): Partially complies.
+- 0.0 (No Compliance): Fails to comply.
+
+## OUTPUT FORMAT:
+Return your evaluation in JSON format with two fields:
+{
+  "score": [your score: 1.0, 0.5, or 0.0],
+  "reason": "[detailed explanation of whether the rubric criterion was satisfied and why]"
+}
+
+NOTE: ONLY output the JSON object, without any explanation before or after that`
+}
+
+export function parseBeamRubricJudgeResponse(response: string): BeamRubricJudgeResult {
+  try {
+    const parsed = parseJsonResponse(response)
+    const score = typeof parsed.score === "number" ? parsed.score : Number(parsed.score)
+    return {
+      score: Number.isFinite(score) ? score : 0,
+      reason: typeof parsed.reason === "string" ? parsed.reason : "",
+    }
+  } catch {
+    return {
+      score: 0,
+      reason: "Failed to parse judge response",
+    }
+  }
+}
diff --git a/src/types/checkpoint.ts b/src/types/checkpoint.ts
index f8f1180..b3ff572 100644
--- a/src/types/checkpoint.ts
+++ b/src/types/checkpoint.ts
@@ -70,6 +70,7 @@ export interface EvaluatePhaseCheckpoint {
   score?: number
   explanation?: string
   retrievalMetrics?: RetrievalMetrics
+  details?: Record<string, unknown>
   startedAt?: string
   completedAt?: string
   durationMs?: number
diff --git a/src/types/judge.ts b/src/types/judge.ts
index cf8bcb3..aef64aa 100644
--- a/src/types/judge.ts
+++ b/src/types/judge.ts
@@ -20,6 +20,7 @@ export interface JudgeResult {
   score: number
   label: "correct" | "incorrect"
   explanation: string
+  details?: Record<string, unknown>
 }
 
 export interface Judge {
diff --git a/src/types/unified.ts b/src/types/unified.ts
index e4a0deb..e5ef939 100644
--- a/src/types/unified.ts
+++ b/src/types/unified.ts
@@ -66,6 +66,7 @@ export interface EvaluationResult {
   answerDurationMs: number
   totalDurationMs: number
   retrievalMetrics?: RetrievalMetrics
+  details?: Record<string, unknown>
 }
 
 export interface LatencyStats {

From 110237ae864d9701b01638a2a8afaa843d6b7f80 Mon Sep 17 00:00:00 2001
From: Sreeram Sreedhar <ssreedh9@asu.edu>
Date: Sun, 24 May 2026 00:51:41 -0400
Subject: [PATCH 3/4] fixed date issue

---
 src/benchmarks/beam/index.ts      |  38 +++-
 src/orchestrator/phases/answer.ts |  21 ++-
 src/prompts/beam.ts               | 276 ++++++++++++++++++++++++------
 3 files changed, 282 insertions(+), 53 deletions(-)

diff --git a/src/benchmarks/beam/index.ts b/src/benchmarks/beam/index.ts
index 1d70583..6fe6854 100644
--- a/src/benchmarks/beam/index.ts
+++ b/src/benchmarks/beam/index.ts
@@ -8,6 +8,7 @@ import type {
   UnifiedSession,
 } from "../../types/unified"
 import { logger } from "../../utils/logger"
+import { formatBeamDate, parseBeamTimeAnchor } from "../../prompts/beam"
 import type { BeamBatch, BeamChatFile, BeamProbingQuestionsFile, BeamScale } from "./types"
 
 const DEFAULT_DATA_PATH = "./data/benchmarks/beam/chats"
@@ -92,7 +93,17 @@ function createGroundTruth(question: unknown): string {
   if (typeof question === "object" && question !== null) {
     const record = question as Record<string, unknown>
     const answer = getQuestionAnswer(record)
-    return answer || JSON.stringify(question)
+    if (answer) return answer
+
+    // Fall back to the rubric so retrieval-eval gets a useful expected-answer
+    // signal for types like instruction_following/preference_following that
+    // describe expected behavior via rubric items instead of a single answer.
+    const rubric = record.rubric
+    if (Array.isArray(rubric) && rubric.every((item) => typeof item === "string")) {
+      return rubric.join("\n")
+    }
+
+    return JSON.stringify(question)
   }
 
   return JSON.stringify(question)
@@ -206,6 +217,24 @@ export class BeamBenchmark implements Benchmark {
     const sessions: UnifiedSession[] = []
 
     for (const batch of batches) {
+      // mem0's `get_time_anchor_epoch` finds the earliest non-null `time_anchor`
+      // across all messages in a batch and tags every memory derived from that
+      // batch with it. Most turns in BEAM don't carry their own anchor, so
+      // hoisting the batch-level anchor here gives dates to every session in
+      // the batch (matching mem0's per-memory dating).
+      let batchTimeAnchor: string | null | undefined = batch.time_anchor ?? null
+      if (!batchTimeAnchor) {
+        for (const turn of batch.turns) {
+          const msgWithAnchor = turn.find((m) => m.time_anchor)
+          if (msgWithAnchor?.time_anchor) {
+            batchTimeAnchor = msgWithAnchor.time_anchor
+            break
+          }
+        }
+      }
+      const batchDateIso = parseBeamTimeAnchor(batchTimeAnchor)
+      const batchDateFormatted = batchDateIso ? formatBeamDate(batchDateIso) : undefined
+
       for (let turnIndex = 0; turnIndex < batch.turns.length; turnIndex++) {
         const turn = batch.turns[turnIndex]
         const messages = turn
@@ -229,7 +258,12 @@ export class BeamBenchmark implements Benchmark {
             chatId,
             batchNumber: batch.batch_number,
             turnIndex: turnIndex + 1,
-            timeAnchor: batch.time_anchor,
+            // Match LocoMo / LongMemEval: `date` (ISO) + `formattedDate`
+            // (readable). The Supermemory provider reads these fields to
+            // (a) attach `metadata.date` to the document and (b) prefix the
+            // ingested content with a natural-language date sentence.
+            date: batchDateIso,
+            formattedDate: batchDateFormatted,
           },
         })
       }
diff --git a/src/orchestrator/phases/answer.ts b/src/orchestrator/phases/answer.ts
index d9e6df7..4cfc92a 100644
--- a/src/orchestrator/phases/answer.ts
+++ b/src/orchestrator/phases/answer.ts
@@ -11,6 +11,7 @@ import { config } from "../../utils/config"
 import { logger } from "../../utils/logger"
 import { getModelConfig, ModelConfig, DEFAULT_ANSWERING_MODEL } from "../../utils/models"
 import { buildDefaultAnswerPrompt } from "../../prompts/defaults"
+import { buildBeamAnswerPrompt } from "../../prompts/beam"
 import { buildContextString } from "../../types/prompts"
 import { ConcurrentExecutor } from "../concurrent"
 import { resolveConcurrency } from "../../types/concurrency"
@@ -119,8 +120,24 @@ export async function runAnswerPhase(
         const context: unknown[] = searchData.results || []
         const questionDate = checkpoint.questions[question.questionId]?.questionDate
 
-        const basePrompt = buildAnswerPrompt(question.question, [], questionDate, provider)
-        const prompt = buildAnswerPrompt(question.question, context, questionDate, provider)
+        // BEAM uses mem0's answer-generation prompt verbatim so retrieval +
+        // judging numbers are apples-to-apples with their published BEAM
+        // results. The provider-specific prompt is intentionally bypassed.
+        let basePrompt: string
+        let prompt: string
+        if (benchmark.name.startsWith("beam")) {
+          const sessions = benchmark.getHaystackSessions(question.questionId)
+          const sessionDateMap = new Map<string, string>()
+          for (const s of sessions) {
+            const date = s.metadata?.date
+            if (typeof date === "string") sessionDateMap.set(s.sessionId, date)
+          }
+          basePrompt = buildBeamAnswerPrompt(question.question, [], sessionDateMap)
+          prompt = buildBeamAnswerPrompt(question.question, context, sessionDateMap)
+        } else {
+          basePrompt = buildAnswerPrompt(question.question, [], questionDate, provider)
+          prompt = buildAnswerPrompt(question.question, context, questionDate, provider)
+        }
 
         const basePromptTokens = countTokens(basePrompt, modelConfig)
         const promptTokens = countTokens(prompt, modelConfig)
diff --git a/src/prompts/beam.ts b/src/prompts/beam.ts
index 625252a..b91f113 100644
--- a/src/prompts/beam.ts
+++ b/src/prompts/beam.ts
@@ -3,6 +3,157 @@ export interface BeamRubricJudgeResult {
   reason: string
 }
 
+/**
+ * Number of memories to retrieve per BEAM search. Matches mem0's evaluation
+ * cutoff exactly (their --top-k-cutoffs default) so the answering model sees
+ * the same context budget. mem0 retrieves 200 and trims to 100; we retrieve
+ * 100 directly since we evaluate at a single cutoff.
+ */
+export const BEAM_SEARCH_TOP_K = 100
+
+/**
+ * How many of the retrieved memories to expose to the answering model.
+ * Mirrors BEAM_SEARCH_TOP_K — every retrieved memory is shown.
+ */
+export const BEAM_ANSWER_TOP_K = 100
+
+const MONTH_NAMES = [
+  "January",
+  "February",
+  "March",
+  "April",
+  "May",
+  "June",
+  "July",
+  "August",
+  "September",
+  "October",
+  "November",
+  "December",
+]
+
+const MONTH_TO_NUMBER: Record<string, string> = Object.fromEntries(
+  MONTH_NAMES.map((name, i) => [name.toLowerCase(), String(i + 1).padStart(2, "0")])
+)
+
+/**
+ * Parse BEAM's per-batch time_anchor strings into ISO YYYY-MM-DD. BEAM stores
+ * these as "Month-DD-YYYY" (e.g. "March-01-2024"); some batches have a null
+ * anchor, in which case we return undefined and the memory line is rendered
+ * without a date prefix (matching mem0's "if no created_at" branch).
+ */
+export function parseBeamTimeAnchor(anchor: unknown): string | undefined {
+  if (typeof anchor !== "string") return undefined
+  const m = anchor.match(/^(\w+)-(\d{1,2})-(\d{4})$/)
+  if (!m) return undefined
+  const [, monthName, dayStr, year] = m
+  const month = MONTH_TO_NUMBER[monthName.toLowerCase()]
+  if (!month) return undefined
+  return `${year}-${month}-${dayStr.padStart(2, "0")}`
+}
+
+/**
+ * Human-readable form of a BEAM ISO date, used as `formattedDate` on session
+ * metadata so the Supermemory provider includes a natural-language date prefix
+ * in the ingested content (mirrors the LocoMo / LongMemEval pattern).
+ */
+export function formatBeamDate(iso: string): string {
+  const [year, month, day] = iso.split("-")
+  const monthName = MONTH_NAMES[parseInt(month, 10) - 1]
+  if (!monthName) return iso
+  return `${monthName} ${parseInt(day, 10)}, ${year}`
+}
+
+interface BeamMemoryLike {
+  memory?: string
+  content?: string
+  metadata?: { sessionId?: string }
+}
+
+/**
+ * Ported verbatim from mem0's get_beam_answer_generation_prompt
+ * (mem0ai/memory-benchmarks/benchmarks/beam/prompts.py). Memories are sliced
+ * to BEAM_ANSWER_TOP_K, sorted chronologically (oldest first), numbered, and
+ * prefixed with their session date when available — exactly the format mem0's
+ * answering LLM sees on their published BEAM 1M / 10M numbers.
+ */
+export function buildBeamAnswerPrompt(
+  question: string,
+  memories: unknown[],
+  sessionDateMap: Map<string, string>
+): string {
+  const sliced = memories.slice(0, BEAM_ANSWER_TOP_K) as BeamMemoryLike[]
+  const memoriesText = formatBeamMemories(sliced, sessionDateMap)
+  return `You are an AI assistant with access to stored memories from prior conversations with a user.
+Use these memories to answer the following question as accurately and completely as possible.
+
+IMPORTANT RULES:
+1. Scan ALL provided memories before answering — do not stop after the first relevant one.
+2. If multiple memories contain relevant information, combine and cross-reference them.
+3. If the memories contain contradictory information, prefer the more recent one.
+4. If the memories don't contain enough information to answer, say exactly: "I don't have enough information to answer this question."
+5. For temporal questions: pay attention to dates and relative time references.
+6. For ordering questions: present events in chronological order.
+7. For preference questions: use the most recently stated preference.
+8. Be specific and direct — include exact names, dates, numbers, and details from the memories.
+9. Do NOT invent or assume information that isn't in the memories.
+
+QUESTION: ${question}
+
+RETRIEVED MEMORIES:
+${memoriesText}
+
+ANSWER:`
+}
+
+function formatBeamMemories(
+  memories: BeamMemoryLike[],
+  sessionDateMap: Map<string, string>
+): string {
+  if (memories.length === 0) return "(No memories available)"
+
+  // Resolve text + date per memory.
+  const items = memories.map((m) => {
+    const text =
+      typeof m?.memory === "string"
+        ? m.memory
+        : typeof m?.content === "string"
+          ? m.content
+          : JSON.stringify(m)
+    const sessionId =
+      typeof m?.metadata?.sessionId === "string" ? m.metadata.sessionId : ""
+    const date = sessionId ? sessionDateMap.get(sessionId) : undefined
+    return { text, sessionId, date }
+  })
+
+  // mem0 sorts by created_at ascending (oldest first). When date is missing
+  // we fall back to sessionId order, which is itself chronological for BEAM
+  // since sessionIds encode batch + turn ordinals.
+  items.sort((a, b) => {
+    const aKey = a.date || ""
+    const bKey = b.date || ""
+    if (aKey !== bKey) return aKey.localeCompare(bKey)
+    return a.sessionId.localeCompare(b.sessionId, undefined, { numeric: true })
+  })
+
+  return items
+    .map((item, i) => {
+      const prefix = item.date ? `[${item.date}] ` : ""
+      return `${i + 1}. ${prefix}${item.text}`
+    })
+    .join("\n")
+}
+
+/**
+ * Ported verbatim from mem0's BEAM benchmark setup
+ * (mem0ai/memory-benchmarks/benchmarks/beam/prompts.py) so our judging is
+ * apples-to-apples with their published BEAM 1M / 10M numbers.
+ */
+export const BEAM_JUDGE_SYSTEM_PROMPT =
+  "You are an expert evaluator assessing whether an AI assistant's response satisfies " +
+  "specific rubric criteria. You must be objective, fair, and consistent. " +
+  "Return ONLY valid JSON with the exact format requested."
+
 function parseJsonResponse(response: string): Record<string, unknown> {
   const trimmed = response.trim()
 
@@ -21,69 +172,96 @@ function parseJsonResponse(response: string): Record<string, unknown> {
   return JSON.parse(trimmed)
 }
 
+/**
+ * Ported from mem0's get_beam_nugget_judge_prompt. Each rubric "nugget" is
+ * scored independently on a 3-point scale by the judge LLM.
+ */
 export function buildBeamRubricJudgePrompt(
   question: string,
-  rubricItem: string,
+  nugget: string,
   llmResponse: string
 ): string {
-  return `You are an expert evaluator tasked with judging whether the LLM's response demonstrates compliance with the specified RUBRIC CRITERION.
-
-## EVALUATION INPUTS
-- QUESTION (what the user asked): ${question}
-- RUBRIC CRITERION (what to check): ${rubricItem}
-- RESPONSE TO EVALUATE: ${llmResponse}
-
-## EVALUATION RUBRIC:
-The rubric defines a specific requirement, constraint, or expected behavior that the LLM response should demonstrate.
-
-**IMPORTANT**: Pay careful attention to whether the rubric specifies:
-- **Positive requirements** (things the response SHOULD include/do)
-- **Negative constraints** (things the response SHOULD NOT include/do, often indicated by "no", "not", "avoid", "absent")
-
-## RESPONSIVENESS REQUIREMENT (anchored to the QUESTION)
-A compliant response must be **on-topic with respect to the QUESTION** and attempt to answer it.
-- If the response does not address the QUESTION, score **0.0** and stop.
-- For negative constraints, both must hold: (a) the response is responsive to the QUESTION, and (b) the prohibited element is absent.
-
-## SEMANTIC TOLERANCE RULES:
-Judge by meaning, not exact wording.
-- Accept paraphrases and synonyms that preserve intent.
-- Case/punctuation/whitespace differences must be ignored.
-- Numbers/currencies/dates may appear in equivalent forms when numerically equivalent.
-- If the rubric expects a number or duration, prefer normalized comparison over string matching.
-
-## STYLE NEUTRALITY:
-Ignore tone, politeness, length, and flourish unless the rubric explicitly requires a format/structure.
-- Do not penalize hedging, voice, or verbosity if content satisfies the rubric.
-- Only evaluate format when the rubric explicitly mandates it.
-
-## SCORING SCALE:
-- 1.0 (Complete Compliance): Fully complies with the rubric criterion.
-- 0.5 (Partial Compliance): Partially complies.
-- 0.0 (No Compliance): Fails to comply.
-
-## OUTPUT FORMAT:
-Return your evaluation in JSON format with two fields:
-{
-  "score": [your score: 1.0, 0.5, or 0.0],
-  "reason": "[detailed explanation of whether the rubric criterion was satisfied and why]"
+  return `Evaluate whether the following LLM response demonstrates compliance with the specified RUBRIC CRITERION.
+
+QUESTION:
+${question}
+
+LLM RESPONSE:
+${llmResponse}
+
+RUBRIC CRITERION:
+${nugget}
+
+SCORING GUIDELINES:
+
+First, determine whether the rubric criterion is a POSITIVE requirement (the response SHOULD include something) or a NEGATIVE constraint (the response SHOULD NOT include something).
+
+**For POSITIVE requirements** (response should contain, mention, or demonstrate something):
+- **1.0 (Complete Compliance)**: The required element is present, accurate, and complete. The response fully and clearly satisfies the rubric criterion.
+- **0.5 (Partial Compliance)**: The required element is partially present, has minor inaccuracies, or is incomplete. The core intent is present but not fully realized.
+- **0.0 (No Compliance)**: The required element is missing, incorrect, or the response is entirely off-topic / non-responsive.
+
+**For NEGATIVE constraints** (response should NOT contain or should avoid something):
+- **1.0 (Complete Compliance)**: The response is responsive to the question AND the prohibited element is absent.
+- **0.5 (Partial Compliance)**: The response is responsive but contains a borderline or ambiguous reference to the prohibited element.
+- **0.0 (No Compliance)**: The prohibited element is present in the response, OR the response is non-responsive (off-topic, refusal, empty).
+
+**Compound statement handling**: If the rubric criterion contains "and" or commas connecting multiple required elements:
+- All elements present and correct = 1.0
+- Some (but not all) elements present and correct = 0.5
+- No elements present or correct = 0.0
+
+EVALUATION RULES:
+1. **Semantic tolerance**: Paraphrases and synonyms are acceptable. The response does not need to use the exact same words as the rubric.
+2. **Numeric and date equivalence**: Treat equivalent representations as identical. "$68,000" = "68k" = "sixty-eight thousand dollars". "2 years" = "24 months". Prefer normalized comparison for numbers, currencies, dates, and durations.
+3. **Case / punctuation / whitespace tolerance**: Differences in capitalization, punctuation, and whitespace must be ignored when comparing content.
+4. **Hedging tolerance**: Do not penalize hedging language ("I think", "probably", "it seems"), passive voice, or verbosity if the substantive content satisfies the rubric criterion.
+5. **Style neutrality**: Do not penalize for tone, formatting, or length unless the rubric criterion specifically requires a particular format.
+6. **Responsiveness**: If the LLM response is completely off-topic or refuses to answer, score 0.0 for all criteria.
+7. **Independence**: Evaluate this criterion in isolation — do not consider other rubric items.
+8. **Specificity matters**: Vague or generic answers that could apply to any question score lower than specific, detailed answers.
+
+STEP-BY-STEP EVALUATION:
+Follow these steps in order:
+1. **Understand the Requirement**: Read the rubric criterion and classify it as a positive requirement or a negative constraint.
+2. **Parse Compound Statements**: If the criterion contains multiple sub-requirements joined by "and" or commas, identify each element separately.
+3. **Check Compliance**: Compare the LLM response against each element, applying the tolerance rules above (semantic, numeric, case, hedging).
+4. **Assign Score**: Use the appropriate scoring table (positive or negative) and compound-statement rule to determine the score.
+5. **Provide Reasoning**: Write a concise explanation referencing which elements were or were not satisfied.
+
+Return your evaluation as a JSON object with exactly two fields:
+{"score": <0.0 or 0.5 or 1.0>, "reason": "<one concise sentence explaining your score>"}`
 }
 
-NOTE: ONLY output the JSON object, without any explanation before or after that`
+/**
+ * Ported from mem0's _clamp_nugget_score. Snaps any numeric score the judge
+ * returns to the nearest of {0.0, 0.5, 1.0}, instead of penalizing on exact
+ * match failure.
+ */
+export function clampNuggetScore(raw: number): 0 | 0.5 | 1 {
+  if (!Number.isFinite(raw)) return 0
+  if (raw >= 0.75) return 1
+  if (raw >= 0.25) return 0.5
+  return 0
 }
 
+/**
+ * Ported from mem0's judge_single_nugget response handling. If JSON parsing
+ * works, clamp the score; otherwise fall back to scanning the raw text for
+ * "1.0" / "0.5" markers.
+ */
 export function parseBeamRubricJudgeResponse(response: string): BeamRubricJudgeResult {
   try {
     const parsed = parseJsonResponse(response)
-    const score = typeof parsed.score === "number" ? parsed.score : Number(parsed.score)
+    const raw = typeof parsed.score === "number" ? parsed.score : Number(parsed.score)
     return {
-      score: Number.isFinite(score) ? score : 0,
+      score: clampNuggetScore(raw),
       reason: typeof parsed.reason === "string" ? parsed.reason : "",
     }
   } catch {
-    return {
-      score: 0,
-      reason: "Failed to parse judge response",
-    }
+    const snippet = response.slice(0, 200)
+    if (snippet.includes("1.0")) return { score: 1, reason: snippet }
+    if (snippet.includes("0.5")) return { score: 0.5, reason: snippet }
+    return { score: 0, reason: `Parse error: ${snippet}` }
   }
 }

From 7064491d353ea343a45bd1b2f42badfd11b52ddf Mon Sep 17 00:00:00 2001
From: Sreeram Sreedhar <ssreedh9@asu.edu>
Date: Sun, 24 May 2026 01:15:50 -0400
Subject: [PATCH 4/4] add BEAM to CLI help and benchmark README

---
 README.md                |  2 +-
 src/benchmarks/README.md | 15 +++++++++++++++
 src/cli/index.ts         |  7 +++++++
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ef60b67..9b0ac51 100644
--- a/README.md
+++ b/README.md
@@ -74,7 +74,7 @@ GOOGLE_API_KEY=
 
 ```
 -p, --provider         Memory provider (supermemory, mem0, zep)
--b, --benchmark        Benchmark (locomo, longmemeval, convomem)
+-b, --benchmark        Benchmark (locomo, longmemeval, convomem, beam-1m, beam-10m)
 -j, --judge            Judge model (gpt-4o, sonnet-4, gemini-2.5-flash, etc.)
 -r, --run-id           Run identifier (auto-generated if omitted)
 -m, --answering-model  Model for answer generation (default: gpt-4o)
diff --git a/src/benchmarks/README.md b/src/benchmarks/README.md
index 8d43cf9..e6b8dbc 100644
--- a/src/benchmarks/README.md
+++ b/src/benchmarks/README.md
@@ -36,6 +36,7 @@ interface Benchmark {
 | `locomo` | GitHub snap-research/locomo | Long context memory benchmark |
 | `longmemeval` | HuggingFace xiaowu0162/longmemeval-cleaned | Long-term memory evaluation |
 | `convomem` | HuggingFace Salesforce/ConvoMem | Conversational memory benchmark |
+| `beam-1m` / `beam-10m` | HuggingFace Mohammadta/BEAM | Beyond a Million Tokens benchmark (1M and 10M token tiers) |
 
 ## Question Types
 
@@ -67,3 +68,17 @@ interface Benchmark {
 | `changing_evidence` | change | Information updates |
 | `implicit_connection_evidence` | implicit | Implicit reasoning |
 | `abstention_evidence` | abstain | Unanswerable questions |
+
+### BEAM
+| Type | Alias | Description |
+|------|-------|-------------|
+| `abstention` | abstain | Withhold answers when evidence is missing |
+| `contradiction_resolution` | contradict | Detect and reconcile inconsistent statements |
+| `event_ordering` | order | Reconstruct event or information order |
+| `information_extraction` | extract | Recall entities and factual details |
+| `instruction_following` | instruction | Follow sustained user instructions |
+| `knowledge_update` | update | Retain updated facts over stale facts |
+| `multi_session_reasoning` | multi | Reason across non-adjacent dialogue segments |
+| `preference_following` | preference | Adapt to evolving user preferences |
+| `summarization` | summary | Summarize dialogue content |
+| `temporal_reasoning` | temporal | Reason about explicit and implicit time relations |
diff --git a/src/cli/index.ts b/src/cli/index.ts
index b3c29d6..e6ae3db 100644
--- a/src/cli/index.ts
+++ b/src/cli/index.ts
@@ -151,10 +151,17 @@ Available benchmark datasets for evaluation:
                  Tests: user facts, assistant facts, preferences, implicit connections
                  Source: HuggingFace Salesforce/ConvoMem (downloaded on first use)
 
+  beam           BEAM - Beyond a Million Tokens benchmark
+                 Tests: abstention, contradiction, event ordering, extraction, instructions, knowledge update, multi-session, preferences, summarization, temporal
+                 Source: HuggingFace Mohammadta/BEAM (downloaded on first use)
+                 Scales: beam-1m (700 q / 35 chats), beam-10m (200 q / 10 chats)
+
 Usage:
   -b locomo        Run LoCoMo benchmark
   -b longmemeval   Run LongMemEval benchmark
   -b convomem      Run ConvoMem benchmark
+  -b beam-1m       Run BEAM 1M-token tier
+  -b beam-10m      Run BEAM 10M-token tier
 `)
 }