From 2346e429a06b8d14ffa82bfe6da624906f505522 Mon Sep 17 00:00:00 2001 From: Sreeram Sreedhar Date: Fri, 22 May 2026 22:26:06 -0400 Subject: [PATCH 1/4] initial commit --- src/benchmarks/beam/index.ts | 273 +++++++++++++++++++++++++++++++++++ src/benchmarks/beam/types.ts | 31 ++++ src/benchmarks/index.ts | 6 +- src/types/benchmark.ts | 3 +- 4 files changed, 311 insertions(+), 2 deletions(-) create mode 100644 src/benchmarks/beam/index.ts create mode 100644 src/benchmarks/beam/types.ts diff --git a/src/benchmarks/beam/index.ts b/src/benchmarks/beam/index.ts new file mode 100644 index 0000000..3af5a93 --- /dev/null +++ b/src/benchmarks/beam/index.ts @@ -0,0 +1,273 @@ +import { existsSync, readFileSync, readdirSync } from "fs" +import { join } from "path" +import type { Benchmark, BenchmarkConfig, QuestionFilter } from "../../types/benchmark" +import type { + QuestionTypeRegistry, + UnifiedMessage, + UnifiedQuestion, + UnifiedSession, +} from "../../types/unified" +import { logger } from "../../utils/logger" +import type { BeamBatch, BeamChatFile, BeamProbingQuestionsFile, BeamScale } from "./types" + +const DEFAULT_DATA_PATH = "./data/benchmarks/beam/chats" + +export const BEAM_QUESTION_TYPES: QuestionTypeRegistry = { + abstention: { + id: "abstention", + alias: "abstain", + description: "Withhold answers when evidence is missing", + }, + contradiction_resolution: { + id: "contradiction_resolution", + alias: "contradict", + description: "Detect and reconcile inconsistent statements", + }, + event_ordering: { + id: "event_ordering", + alias: "order", + description: "Reconstruct event or information order", + }, + information_extraction: { + id: "information_extraction", + alias: "extract", + description: "Recall entities and factual details", + }, + instruction_following: { + id: "instruction_following", + alias: "instruction", + description: "Follow sustained user instructions", + }, + knowledge_update: { + id: "knowledge_update", + alias: "update", + description: "Retain updated facts over stale facts", + }, + multi_session_reasoning: { + id: "multi_session_reasoning", + alias: "multi", + description: "Reason across non-adjacent dialogue segments", + }, + preference_following: { + id: "preference_following", + alias: "preference", + description: "Adapt to evolving user preferences", + }, + summarization: { + id: "summarization", + alias: "summary", + description: "Summarize dialogue content", + }, + temporal_reasoning: { + id: "temporal_reasoning", + alias: "temporal", + description: "Reason about explicit and implicit time relations", + }, +} + +function flattenChatFile(chatFile: BeamChatFile): BeamBatch[] { + if (Array.isArray(chatFile)) { + return chatFile.flatMap((entry) => { + if ("turns" in entry) return [entry] + return flattenChatFile(entry) + }) + } + + return Object.keys(chatFile) + .sort((a, b) => a.localeCompare(b, undefined, { numeric: true })) + .flatMap((key) => chatFile[key] || []) +} + +function createGroundTruth(question: unknown): string { + return JSON.stringify(question) +} + +function getQuestionAnswer(question: Record): string | undefined { + const answer = + question.answer || question.ideal_answer || question.ideal_response || question.ideal_summary + return typeof answer === "string" ? answer : undefined +} + +export class BeamBenchmark implements Benchmark { + name: string + private scales: BeamScale[] + private questions: UnifiedQuestion[] = [] + private sessionsMap: Map = new Map() + private ingestionGroupMap: Map = new Map() + private dataPath: string = "" + + constructor(scales: BeamScale[] = ["1M", "10M"], name = "beam") { + this.scales = scales + this.name = name + } + + async load(config?: BenchmarkConfig): Promise { + this.dataPath = config?.dataPath || DEFAULT_DATA_PATH + const fullPath = join(process.cwd(), this.dataPath) + + if (!existsSync(fullPath)) { + throw new Error( + `BEAM dataset not found at ${fullPath}. Expected chats under ${DEFAULT_DATA_PATH}/{1M,10M}.` + ) + } + + for (const scale of this.scales) { + this.loadScale(fullPath, scale) + } + + logger.info( + `Loaded ${this.questions.length} questions from BEAM (${this.scales.join(", ")})` + ) + } + + private loadScale(basePath: string, scale: BeamScale): void { + const scalePath = join(basePath, scale) + if (!existsSync(scalePath)) { + throw new Error(`BEAM ${scale} dataset not found at ${scalePath}`) + } + + const chatDirs = readdirSync(scalePath, { withFileTypes: true }) + .filter((entry) => entry.isDirectory() && /^\d+$/.test(entry.name)) + .map((entry) => entry.name) + .sort((a, b) => Number(a) - Number(b)) + + for (const chatId of chatDirs) { + this.loadChat(scalePath, scale, chatId) + } + } + + private loadChat(scalePath: string, scale: BeamScale, chatId: string): void { + const chatDir = join(scalePath, chatId) + const truncatedPath = join(chatDir, "chat_trunecated.json") + const fullChatPath = join(chatDir, "chat.json") + const chatPath = existsSync(truncatedPath) ? truncatedPath : fullChatPath + const probingPath = join(chatDir, "probing_questions", "probing_questions.json") + + if (!existsSync(chatPath) || !existsSync(probingPath)) { + logger.warn(`Skipping BEAM ${scale}/${chatId}: missing chat or probing questions`) + return + } + + const batches = flattenChatFile(JSON.parse(readFileSync(chatPath, "utf8")) as BeamChatFile) + const sessions = this.extractSessions(scale, chatId, batches) + const probingQuestions = JSON.parse( + readFileSync(probingPath, "utf8") + ) as BeamProbingQuestionsFile + const sessionIds = sessions.map((session) => session.sessionId) + const ingestionGroupId = `beam-${scale}-${chatId}` + + for (const questionType of Object.keys(probingQuestions)) { + const questionsForType = probingQuestions[questionType] || [] + + for (let i = 0; i < questionsForType.length; i++) { + const probingQuestion = questionsForType[i] + const questionId = `${ingestionGroupId}-${questionType}-${i}` + const answer = getQuestionAnswer(probingQuestion) + + this.questions.push({ + questionId, + question: probingQuestion.question, + questionType, + groundTruth: createGroundTruth(probingQuestion), + haystackSessionIds: sessionIds, + metadata: { + scale, + chatId, + ingestionGroupId, + rubric: probingQuestion.rubric, + difficulty: probingQuestion.difficulty, + answer, + }, + }) + + this.sessionsMap.set(questionId, sessions) + this.ingestionGroupMap.set(questionId, ingestionGroupId) + } + } + } + + private extractSessions(scale: BeamScale, chatId: string, batches: BeamBatch[]): UnifiedSession[] { + const sessions: UnifiedSession[] = [] + + for (const batch of batches) { + for (let turnIndex = 0; turnIndex < batch.turns.length; turnIndex++) { + const turn = batch.turns[turnIndex] + const messages = turn + .filter((message) => message.content) + .map( + (message): UnifiedMessage => ({ + role: message.role, + content: message.content, + speaker: message.role, + timestamp: message.time_anchor, + }) + ) + + if (messages.length === 0) continue + + sessions.push({ + sessionId: `beam-${scale}-${chatId}-batch-${batch.batch_number}-turn-${turnIndex + 1}`, + messages, + metadata: { + scale, + chatId, + batchNumber: batch.batch_number, + turnIndex: turnIndex + 1, + timeAnchor: batch.time_anchor, + }, + }) + } + } + + return sessions + } + + getQuestions(filter?: QuestionFilter): UnifiedQuestion[] { + let result = [...this.questions] + + if (filter?.questionTypes?.length) { + result = result.filter((q) => filter.questionTypes!.includes(q.questionType)) + } + + if (filter?.offset) { + result = result.slice(filter.offset) + } + + if (filter?.limit) { + result = result.slice(0, filter.limit) + } + + return result + } + + getHaystackSessions(questionId: string): UnifiedSession[] { + return this.sessionsMap.get(questionId) || [] + } + + getGroundTruth(questionId: string): string { + const question = this.questions.find((q) => q.questionId === questionId) + return question?.groundTruth || "" + } + + getQuestionTypes(): QuestionTypeRegistry { + return BEAM_QUESTION_TYPES + } + + getIngestionGroupId(questionId: string): string { + return this.ingestionGroupMap.get(questionId) || questionId + } +} + +export class Beam1MBenchmark extends BeamBenchmark { + constructor() { + super(["1M"], "beam-1m") + } +} + +export class Beam10MBenchmark extends BeamBenchmark { + constructor() { + super(["10M"], "beam-10m") + } +} + +export default BeamBenchmark diff --git a/src/benchmarks/beam/types.ts b/src/benchmarks/beam/types.ts new file mode 100644 index 0000000..522cbb0 --- /dev/null +++ b/src/benchmarks/beam/types.ts @@ -0,0 +1,31 @@ +export type BeamScale = "1M" | "10M" + +export interface BeamMessage { + role: "user" | "assistant" + id?: number + content: string + time_anchor?: string + index?: string + question_type?: string +} + +export interface BeamBatch { + batch_number: number + time_anchor?: string | null + turns: BeamMessage[][] +} + +export type BeamChatFile = BeamBatch[] | Record | Record[] + +export interface BeamProbingQuestion { + question: string + rubric: string[] + difficulty?: string + answer?: string + ideal_answer?: string + ideal_response?: string + ideal_summary?: string + [key: string]: unknown +} + +export type BeamProbingQuestionsFile = Record diff --git a/src/benchmarks/index.ts b/src/benchmarks/index.ts index b790e87..56cd0c4 100644 --- a/src/benchmarks/index.ts +++ b/src/benchmarks/index.ts @@ -2,11 +2,15 @@ import type { Benchmark, BenchmarkName } from "../types/benchmark" import { LoCoMoBenchmark } from "./locomo" import { LongMemEvalBenchmark } from "./longmemeval" import { ConvoMemBenchmark } from "./convomem" +import { Beam1MBenchmark, Beam10MBenchmark, BeamBenchmark } from "./beam" const benchmarks: Record Benchmark> = { locomo: LoCoMoBenchmark, longmemeval: LongMemEvalBenchmark, convomem: ConvoMemBenchmark, + "beam-1m": Beam1MBenchmark, + "beam-10m": Beam10MBenchmark, + beam: BeamBenchmark, } export function createBenchmark(name: BenchmarkName): Benchmark { @@ -21,4 +25,4 @@ export function getAvailableBenchmarks(): BenchmarkName[] { return Object.keys(benchmarks) as BenchmarkName[] } -export { LoCoMoBenchmark, LongMemEvalBenchmark, ConvoMemBenchmark } +export { LoCoMoBenchmark, LongMemEvalBenchmark, ConvoMemBenchmark, BeamBenchmark } diff --git a/src/types/benchmark.ts b/src/types/benchmark.ts index 07961e4..7066f7b 100644 --- a/src/types/benchmark.ts +++ b/src/types/benchmark.ts @@ -18,6 +18,7 @@ export interface Benchmark { getHaystackSessions(questionId: string): UnifiedSession[] getGroundTruth(questionId: string): string getQuestionTypes(): QuestionTypeRegistry + getIngestionGroupId?(questionId: string): string } -export type BenchmarkName = "locomo" | "longmemeval" | "convomem" +export type BenchmarkName = "locomo" | "longmemeval" | "convomem" | "beam-1m" | "beam-10m" | "beam" From 5955c9b47399ec7354ccc317dc0d197132060b39 Mon Sep 17 00:00:00 2001 From: Sreeram Sreedhar Date: Fri, 22 May 2026 22:37:31 -0400 Subject: [PATCH 2/4] add beam benchmark integration --- src/benchmarks/beam/index.ts | 18 +++++- src/orchestrator/index.ts | 3 +- src/orchestrator/phases/evaluate.ts | 83 ++++++++++++++++++++++-- src/orchestrator/phases/indexing.ts | 99 ++++++++++++++++++----------- src/orchestrator/phases/ingest.ts | 81 +++++++++++++++-------- src/orchestrator/phases/report.ts | 1 + src/orchestrator/phases/search.ts | 2 +- src/prompts/beam.ts | 89 ++++++++++++++++++++++++++ src/types/checkpoint.ts | 1 + src/types/judge.ts | 1 + src/types/unified.ts | 1 + 11 files changed, 308 insertions(+), 71 deletions(-) create mode 100644 src/prompts/beam.ts diff --git a/src/benchmarks/beam/index.ts b/src/benchmarks/beam/index.ts index 3af5a93..1d70583 100644 --- a/src/benchmarks/beam/index.ts +++ b/src/benchmarks/beam/index.ts @@ -68,7 +68,7 @@ export const BEAM_QUESTION_TYPES: QuestionTypeRegistry = { function flattenChatFile(chatFile: BeamChatFile): BeamBatch[] { if (Array.isArray(chatFile)) { return chatFile.flatMap((entry) => { - if ("turns" in entry) return [entry] + if (isBeamBatch(entry)) return [entry] return flattenChatFile(entry) }) } @@ -78,7 +78,23 @@ function flattenChatFile(chatFile: BeamChatFile): BeamBatch[] { .flatMap((key) => chatFile[key] || []) } +function isBeamBatch(value: unknown): value is BeamBatch { + return ( + typeof value === "object" && + value !== null && + "batch_number" in value && + "turns" in value && + Array.isArray((value as BeamBatch).turns) + ) +} + function createGroundTruth(question: unknown): string { + if (typeof question === "object" && question !== null) { + const record = question as Record + const answer = getQuestionAnswer(record) + return answer || JSON.stringify(question) + } + return JSON.stringify(question) } diff --git a/src/orchestrator/index.ts b/src/orchestrator/index.ts index 64578bb..4d2b2b0 100644 --- a/src/orchestrator/index.ts +++ b/src/orchestrator/index.ts @@ -237,7 +237,8 @@ export class Orchestrator { : allQuestions for (const q of questionsToInit) { - const containerTag = `${q.questionId}-${checkpoint.dataSourceRunId}` + const ingestionGroupId = benchmark.getIngestionGroupId?.(q.questionId) || q.questionId + const containerTag = `${ingestionGroupId}-${checkpoint.dataSourceRunId}` this.checkpointManager.initQuestion(checkpoint, q.questionId, containerTag, { question: q.question, groundTruth: q.groundTruth, diff --git a/src/orchestrator/phases/evaluate.ts b/src/orchestrator/phases/evaluate.ts index a36205f..26c990d 100644 --- a/src/orchestrator/phases/evaluate.ts +++ b/src/orchestrator/phases/evaluate.ts @@ -2,11 +2,76 @@ import type { Judge } from "../../types/judge" import type { Benchmark } from "../../types/benchmark" import type { RunCheckpoint } from "../../types/checkpoint" import type { Provider } from "../../types/provider" +import { generateText } from "ai" import { CheckpointManager } from "../checkpoint" import { logger } from "../../utils/logger" import { ConcurrentExecutor } from "../concurrent" import { resolveConcurrency } from "../../types/concurrency" import { calculateRetrievalMetrics } from "./retrieval-eval" +import { buildBeamRubricJudgePrompt, parseBeamRubricJudgeResponse } from "../../prompts/beam" + +interface BeamRubricItemResult { + rubricItem: string + score: number + reason: string +} + +function getBeamRubric(question: { metadata?: Record }): string[] | null { + const rubric = question.metadata?.rubric + if (!Array.isArray(rubric) || rubric.some((item) => typeof item !== "string")) { + return null + } + + return rubric +} + +async function evaluateBeamRubricQuestion( + judge: Judge, + question: { question: string; metadata?: Record }, + hypothesis: string +): Promise<{ score: number; label: "correct" | "incorrect"; explanation: string; details: Record }> { + const rubric = getBeamRubric(question) + if (!rubric) { + return { + score: 0, + label: "incorrect", + explanation: "Missing BEAM rubric metadata", + details: {}, + } + } + + const model = judge.getModel() + const results: BeamRubricItemResult[] = [] + + for (const rubricItem of rubric) { + const prompt = buildBeamRubricJudgePrompt(question.question, rubricItem, hypothesis) + const { text } = await generateText({ + model, + prompt, + maxOutputTokens: 512, + temperature: 0, + }) + const parsed = parseBeamRubricJudgeResponse(text) + results.push({ + rubricItem, + score: parsed.score, + reason: parsed.reason, + }) + } + + const averageScore = + results.length > 0 ? results.reduce((sum, item) => sum + item.score, 0) / results.length : 0 + + return { + score: averageScore, + label: averageScore >= 1 ? "correct" : "incorrect", + explanation: `BEAM rubric average score: ${averageScore.toFixed(2)}`, + details: { + rubricResults: results, + rubricAverageScore: averageScore, + }, + } +} export async function runEvaluatePhase( judge: Judge, @@ -55,15 +120,18 @@ export async function runEvaluatePhase( try { const searchResults = checkpoint.questions[question.questionId].phases.search.results || [] + const rubric = getBeamRubric(question) const [result, retrievalMetrics] = await Promise.all([ - judge.evaluate({ - question: question.question, - questionType: question.questionType, - groundTruth: question.groundTruth, - hypothesis, - providerPrompts: provider?.prompts, - }), + rubric + ? evaluateBeamRubricQuestion(judge, question, hypothesis) + : judge.evaluate({ + question: question.question, + questionType: question.questionType, + groundTruth: question.groundTruth, + hypothesis, + providerPrompts: provider?.prompts, + }), calculateRetrievalMetrics( judge.getModel(), question.question, @@ -78,6 +146,7 @@ export async function runEvaluatePhase( score: result.score, label: result.label, explanation: result.explanation, + details: result.details, retrievalMetrics, completedAt: new Date().toISOString(), durationMs, diff --git a/src/orchestrator/phases/indexing.ts b/src/orchestrator/phases/indexing.ts index 0304289..bd822fc 100644 --- a/src/orchestrator/phases/indexing.ts +++ b/src/orchestrator/phases/indexing.ts @@ -107,43 +107,64 @@ export async function runIndexingPhase( const concurrency = resolveConcurrency("indexing", checkpoint.concurrency, provider.concurrency) - const tracker = new IndexingProgressTracker(toIndex) + const questionsByContainer = new Map() + for (const question of toIndex) { + const questions = questionsByContainer.get(question.containerTag) || [] + questions.push(question) + questionsByContainer.set(question.containerTag, questions) + } + + const indexingGroups = Array.from(questionsByContainer.entries()).map( + ([containerTag, questions]) => ({ + containerTag, + questions, + representative: questions[0], + }) + ) + + const tracker = new IndexingProgressTracker(indexingGroups.map((group) => group.representative)) const totalEpisodes = tracker.getTotalEpisodes() logger.info( - `Awaiting indexing for ${toIndex.length} questions, ${totalEpisodes} episodes (concurrency: ${concurrency})...` + `Awaiting indexing for ${toIndex.length} questions across ${indexingGroups.length} containers, ${totalEpisodes} episodes (concurrency: ${concurrency})...` ) tracker.display() await ConcurrentExecutor.execute( - toIndex, + indexingGroups, concurrency, checkpoint.runId, "indexing", - async ({ item: question }) => { + async ({ item: group }) => { + const question = group.representative + const groupQuestionIds = group.questions.map((q) => q.questionId) const ingestResult = question.phases.ingest.ingestResult const episodeCount = getEpisodeCount(question) if (!ingestResult || episodeCount === 0) { - checkpointManager.updatePhase(checkpoint, question.questionId, "indexing", { - status: "completed", - completedIds: [], - failedIds: [], - completedAt: new Date().toISOString(), - durationMs: 0, - }) + for (const questionId of groupQuestionIds) { + checkpointManager.updatePhase(checkpoint, questionId, "indexing", { + status: "completed", + completedIds: [], + failedIds: [], + completedAt: new Date().toISOString(), + durationMs: 0, + }) + } tracker.markQuestionDone(question.questionId) return { questionId: question.questionId, durationMs: 0 } } const startTime = Date.now() - checkpointManager.updatePhase(checkpoint, question.questionId, "indexing", { - status: "in_progress", - completedIds: [], - failedIds: [], - startedAt: new Date().toISOString(), - }) + for (const questionId of groupQuestionIds) { + checkpointManager.updatePhase(checkpoint, questionId, "indexing", { + status: "in_progress", + completedIds: [], + failedIds: [], + startedAt: new Date().toISOString(), + }) + } try { let lastProgress: IndexingProgress = { @@ -152,36 +173,42 @@ export async function runIndexingPhase( total: episodeCount, } - await provider.awaitIndexing(ingestResult, question.containerTag, (progress) => { + await provider.awaitIndexing(ingestResult, group.containerTag, (progress) => { lastProgress = progress tracker.update(question.questionId, progress) - checkpointManager.updatePhase(checkpoint, question.questionId, "indexing", { - status: "in_progress", - completedIds: progress.completedIds, - failedIds: progress.failedIds, - }) + for (const questionId of groupQuestionIds) { + checkpointManager.updatePhase(checkpoint, questionId, "indexing", { + status: "in_progress", + completedIds: progress.completedIds, + failedIds: progress.failedIds, + }) + } }) const durationMs = Date.now() - startTime - checkpointManager.updatePhase(checkpoint, question.questionId, "indexing", { - status: "completed", - completedIds: lastProgress.completedIds, - failedIds: lastProgress.failedIds, - completedAt: new Date().toISOString(), - durationMs, - }) + for (const questionId of groupQuestionIds) { + checkpointManager.updatePhase(checkpoint, questionId, "indexing", { + status: "completed", + completedIds: lastProgress.completedIds, + failedIds: lastProgress.failedIds, + completedAt: new Date().toISOString(), + durationMs, + }) + } return { questionId: question.questionId, durationMs } } catch (e) { const error = e instanceof Error ? e.message : String(e) - checkpointManager.updatePhase(checkpoint, question.questionId, "indexing", { - status: "failed", - error, - }) - logger.error(`\nFailed to index ${question.questionId}: ${error}`) + for (const questionId of groupQuestionIds) { + checkpointManager.updatePhase(checkpoint, questionId, "indexing", { + status: "failed", + error, + }) + } + logger.error(`\nFailed to index ${group.containerTag}: ${error}`) throw new Error( - `Indexing failed at ${question.questionId}: ${error}. Fix the issue and resume with the same run ID.` + `Indexing failed at ${group.containerTag}: ${error}. Fix the issue and resume with the same run ID.` ) } } diff --git a/src/orchestrator/phases/ingest.ts b/src/orchestrator/phases/ingest.ts index e38815d..2f93f70 100644 --- a/src/orchestrator/phases/ingest.ts +++ b/src/orchestrator/phases/ingest.ts @@ -32,16 +32,34 @@ export async function runIngestPhase( const concurrency = resolveConcurrency("ingest", checkpoint.concurrency, provider.concurrency) - logger.info(`Ingesting ${pendingQuestions.length} questions (concurrency: ${concurrency})...`) + const questionsByGroup = new Map() + for (const question of pendingQuestions) { + const groupId = benchmark.getIngestionGroupId?.(question.questionId) || question.questionId + const questions = questionsByGroup.get(groupId) || [] + questions.push(question) + questionsByGroup.set(groupId, questions) + } + + const ingestionGroups = Array.from(questionsByGroup.entries()).map(([groupId, questions]) => ({ + groupId, + questions, + representative: questions[0], + })) + + logger.info( + `Ingesting ${pendingQuestions.length} questions across ${ingestionGroups.length} containers (concurrency: ${concurrency})...` + ) await ConcurrentExecutor.executeBatched({ - items: pendingQuestions, + items: ingestionGroups, concurrency, rateLimitMs: RATE_LIMIT_MS, runId: checkpoint.runId, phaseName: "ingest", - executeTask: async ({ item: question, index, total }) => { - const containerTag = `${question.questionId}-${checkpoint.dataSourceRunId}` + executeTask: async ({ item: group, index, total }) => { + const question = group.representative + const groupQuestionIds = group.questions.map((q) => q.questionId) + const containerTag = checkpoint.questions[question.questionId].containerTag const sessions = benchmark.getHaystackSessions(question.questionId) const sessionsMetadata = sessions.map((s) => ({ @@ -49,13 +67,17 @@ export async function runIngestPhase( date: s.metadata?.date as string | undefined, messageCount: s.messages.length, })) - checkpointManager.updateSessions(checkpoint, question.questionId, sessionsMetadata) + for (const questionId of groupQuestionIds) { + checkpointManager.updateSessions(checkpoint, questionId, sessionsMetadata) + } const startTime = Date.now() - checkpointManager.updatePhase(checkpoint, question.questionId, "ingest", { - status: "in_progress", - startedAt: new Date().toISOString(), - }) + for (const questionId of groupQuestionIds) { + checkpointManager.updatePhase(checkpoint, questionId, "ingest", { + status: "in_progress", + startedAt: new Date().toISOString(), + }) + } try { const completedSessions = @@ -75,9 +97,11 @@ export async function runIngestPhase( } completedSessions.push(session.sessionId) - checkpointManager.updatePhase(checkpoint, question.questionId, "ingest", { - completedSessions, - }) + for (const questionId of groupQuestionIds) { + checkpointManager.updatePhase(checkpoint, questionId, "ingest", { + completedSessions: [...completedSessions], + }) + } } if (combinedResult.taskIds && combinedResult.taskIds.length === 0) { @@ -99,25 +123,32 @@ export async function runIngestPhase( } const durationMs = Date.now() - startTime - checkpointManager.updatePhase(checkpoint, question.questionId, "ingest", { - status: "completed", - ingestResult: combinedResult, - completedAt: new Date().toISOString(), - durationMs, - }) + for (const questionId of groupQuestionIds) { + checkpointManager.updatePhase(checkpoint, questionId, "ingest", { + status: "completed", + ingestResult: { + documentIds: [...combinedResult.documentIds], + ...(combinedResult.taskIds ? { taskIds: [...combinedResult.taskIds] } : {}), + }, + completedAt: new Date().toISOString(), + durationMs, + }) + } - logger.progress(index + 1, total, `Ingested ${question.questionId} (${durationMs}ms)`) + logger.progress(index + 1, total, `Ingested ${group.groupId} (${durationMs}ms)`) return { questionId: question.questionId, durationMs } } catch (e) { const error = e instanceof Error ? e.message : String(e) - checkpointManager.updatePhase(checkpoint, question.questionId, "ingest", { - status: "failed", - error, - }) - logger.error(`Failed to ingest ${question.questionId}: ${error}`) + for (const questionId of groupQuestionIds) { + checkpointManager.updatePhase(checkpoint, questionId, "ingest", { + status: "failed", + error, + }) + } + logger.error(`Failed to ingest ${group.groupId}: ${error}`) throw new Error( - `Ingest failed at ${question.questionId}: ${error}. Fix the issue and resume with the same run ID.` + `Ingest failed at ${group.groupId}: ${error}. Fix the issue and resume with the same run ID.` ) } }, diff --git a/src/orchestrator/phases/report.ts b/src/orchestrator/phases/report.ts index 4ec9aab..43b2102 100644 --- a/src/orchestrator/phases/report.ts +++ b/src/orchestrator/phases/report.ts @@ -133,6 +133,7 @@ export function generateReport(benchmark: Benchmark, checkpoint: RunCheckpoint): answerDurationMs, totalDurationMs, retrievalMetrics, + details: evalPhase.details, }) if (retrievalMetrics) { diff --git a/src/orchestrator/phases/search.ts b/src/orchestrator/phases/search.ts index 65e4ac7..c774ab8 100644 --- a/src/orchestrator/phases/search.ts +++ b/src/orchestrator/phases/search.ts @@ -46,7 +46,7 @@ export async function runSearchPhase( checkpoint.runId, "search", async ({ item: question, index, total }) => { - const containerTag = `${question.questionId}-${checkpoint.dataSourceRunId}` + const containerTag = checkpoint.questions[question.questionId].containerTag const startTime = Date.now() checkpointManager.updatePhase(checkpoint, question.questionId, "search", { diff --git a/src/prompts/beam.ts b/src/prompts/beam.ts new file mode 100644 index 0000000..625252a --- /dev/null +++ b/src/prompts/beam.ts @@ -0,0 +1,89 @@ +export interface BeamRubricJudgeResult { + score: number + reason: string +} + +function parseJsonResponse(response: string): Record { + const trimmed = response.trim() + + if (trimmed.startsWith("```")) { + const codeFenceMatch = trimmed.match(/```(?:json)?\s*([\s\S]*?)\s*```/) + if (codeFenceMatch?.[1]) { + return JSON.parse(codeFenceMatch[1]) + } + } + + const jsonMatch = trimmed.match(/\{[\s\S]*\}/) + if (jsonMatch) { + return JSON.parse(jsonMatch[0]) + } + + return JSON.parse(trimmed) +} + +export function buildBeamRubricJudgePrompt( + question: string, + rubricItem: string, + llmResponse: string +): string { + return `You are an expert evaluator tasked with judging whether the LLM's response demonstrates compliance with the specified RUBRIC CRITERION. + +## EVALUATION INPUTS +- QUESTION (what the user asked): ${question} +- RUBRIC CRITERION (what to check): ${rubricItem} +- RESPONSE TO EVALUATE: ${llmResponse} + +## EVALUATION RUBRIC: +The rubric defines a specific requirement, constraint, or expected behavior that the LLM response should demonstrate. + +**IMPORTANT**: Pay careful attention to whether the rubric specifies: +- **Positive requirements** (things the response SHOULD include/do) +- **Negative constraints** (things the response SHOULD NOT include/do, often indicated by "no", "not", "avoid", "absent") + +## RESPONSIVENESS REQUIREMENT (anchored to the QUESTION) +A compliant response must be **on-topic with respect to the QUESTION** and attempt to answer it. +- If the response does not address the QUESTION, score **0.0** and stop. +- For negative constraints, both must hold: (a) the response is responsive to the QUESTION, and (b) the prohibited element is absent. + +## SEMANTIC TOLERANCE RULES: +Judge by meaning, not exact wording. +- Accept paraphrases and synonyms that preserve intent. +- Case/punctuation/whitespace differences must be ignored. +- Numbers/currencies/dates may appear in equivalent forms when numerically equivalent. +- If the rubric expects a number or duration, prefer normalized comparison over string matching. + +## STYLE NEUTRALITY: +Ignore tone, politeness, length, and flourish unless the rubric explicitly requires a format/structure. +- Do not penalize hedging, voice, or verbosity if content satisfies the rubric. +- Only evaluate format when the rubric explicitly mandates it. + +## SCORING SCALE: +- 1.0 (Complete Compliance): Fully complies with the rubric criterion. +- 0.5 (Partial Compliance): Partially complies. +- 0.0 (No Compliance): Fails to comply. + +## OUTPUT FORMAT: +Return your evaluation in JSON format with two fields: +{ + "score": [your score: 1.0, 0.5, or 0.0], + "reason": "[detailed explanation of whether the rubric criterion was satisfied and why]" +} + +NOTE: ONLY output the JSON object, without any explanation before or after that` +} + +export function parseBeamRubricJudgeResponse(response: string): BeamRubricJudgeResult { + try { + const parsed = parseJsonResponse(response) + const score = typeof parsed.score === "number" ? parsed.score : Number(parsed.score) + return { + score: Number.isFinite(score) ? score : 0, + reason: typeof parsed.reason === "string" ? parsed.reason : "", + } + } catch { + return { + score: 0, + reason: "Failed to parse judge response", + } + } +} diff --git a/src/types/checkpoint.ts b/src/types/checkpoint.ts index f8f1180..b3ff572 100644 --- a/src/types/checkpoint.ts +++ b/src/types/checkpoint.ts @@ -70,6 +70,7 @@ export interface EvaluatePhaseCheckpoint { score?: number explanation?: string retrievalMetrics?: RetrievalMetrics + details?: Record startedAt?: string completedAt?: string durationMs?: number diff --git a/src/types/judge.ts b/src/types/judge.ts index cf8bcb3..aef64aa 100644 --- a/src/types/judge.ts +++ b/src/types/judge.ts @@ -20,6 +20,7 @@ export interface JudgeResult { score: number label: "correct" | "incorrect" explanation: string + details?: Record } export interface Judge { diff --git a/src/types/unified.ts b/src/types/unified.ts index e4a0deb..e5ef939 100644 --- a/src/types/unified.ts +++ b/src/types/unified.ts @@ -66,6 +66,7 @@ export interface EvaluationResult { answerDurationMs: number totalDurationMs: number retrievalMetrics?: RetrievalMetrics + details?: Record } export interface LatencyStats { From 110237ae864d9701b01638a2a8afaa843d6b7f80 Mon Sep 17 00:00:00 2001 From: Sreeram Sreedhar Date: Sun, 24 May 2026 00:51:41 -0400 Subject: [PATCH 3/4] fixed date issue --- src/benchmarks/beam/index.ts | 38 +++- src/orchestrator/phases/answer.ts | 21 ++- src/prompts/beam.ts | 276 ++++++++++++++++++++++++------ 3 files changed, 282 insertions(+), 53 deletions(-) diff --git a/src/benchmarks/beam/index.ts b/src/benchmarks/beam/index.ts index 1d70583..6fe6854 100644 --- a/src/benchmarks/beam/index.ts +++ b/src/benchmarks/beam/index.ts @@ -8,6 +8,7 @@ import type { UnifiedSession, } from "../../types/unified" import { logger } from "../../utils/logger" +import { formatBeamDate, parseBeamTimeAnchor } from "../../prompts/beam" import type { BeamBatch, BeamChatFile, BeamProbingQuestionsFile, BeamScale } from "./types" const DEFAULT_DATA_PATH = "./data/benchmarks/beam/chats" @@ -92,7 +93,17 @@ function createGroundTruth(question: unknown): string { if (typeof question === "object" && question !== null) { const record = question as Record const answer = getQuestionAnswer(record) - return answer || JSON.stringify(question) + if (answer) return answer + + // Fall back to the rubric so retrieval-eval gets a useful expected-answer + // signal for types like instruction_following/preference_following that + // describe expected behavior via rubric items instead of a single answer. + const rubric = record.rubric + if (Array.isArray(rubric) && rubric.every((item) => typeof item === "string")) { + return rubric.join("\n") + } + + return JSON.stringify(question) } return JSON.stringify(question) @@ -206,6 +217,24 @@ export class BeamBenchmark implements Benchmark { const sessions: UnifiedSession[] = [] for (const batch of batches) { + // mem0's `get_time_anchor_epoch` finds the earliest non-null `time_anchor` + // across all messages in a batch and tags every memory derived from that + // batch with it. Most turns in BEAM don't carry their own anchor, so + // hoisting the batch-level anchor here gives dates to every session in + // the batch (matching mem0's per-memory dating). + let batchTimeAnchor: string | null | undefined = batch.time_anchor ?? null + if (!batchTimeAnchor) { + for (const turn of batch.turns) { + const msgWithAnchor = turn.find((m) => m.time_anchor) + if (msgWithAnchor?.time_anchor) { + batchTimeAnchor = msgWithAnchor.time_anchor + break + } + } + } + const batchDateIso = parseBeamTimeAnchor(batchTimeAnchor) + const batchDateFormatted = batchDateIso ? formatBeamDate(batchDateIso) : undefined + for (let turnIndex = 0; turnIndex < batch.turns.length; turnIndex++) { const turn = batch.turns[turnIndex] const messages = turn @@ -229,7 +258,12 @@ export class BeamBenchmark implements Benchmark { chatId, batchNumber: batch.batch_number, turnIndex: turnIndex + 1, - timeAnchor: batch.time_anchor, + // Match LocoMo / LongMemEval: `date` (ISO) + `formattedDate` + // (readable). The Supermemory provider reads these fields to + // (a) attach `metadata.date` to the document and (b) prefix the + // ingested content with a natural-language date sentence. + date: batchDateIso, + formattedDate: batchDateFormatted, }, }) } diff --git a/src/orchestrator/phases/answer.ts b/src/orchestrator/phases/answer.ts index d9e6df7..4cfc92a 100644 --- a/src/orchestrator/phases/answer.ts +++ b/src/orchestrator/phases/answer.ts @@ -11,6 +11,7 @@ import { config } from "../../utils/config" import { logger } from "../../utils/logger" import { getModelConfig, ModelConfig, DEFAULT_ANSWERING_MODEL } from "../../utils/models" import { buildDefaultAnswerPrompt } from "../../prompts/defaults" +import { buildBeamAnswerPrompt } from "../../prompts/beam" import { buildContextString } from "../../types/prompts" import { ConcurrentExecutor } from "../concurrent" import { resolveConcurrency } from "../../types/concurrency" @@ -119,8 +120,24 @@ export async function runAnswerPhase( const context: unknown[] = searchData.results || [] const questionDate = checkpoint.questions[question.questionId]?.questionDate - const basePrompt = buildAnswerPrompt(question.question, [], questionDate, provider) - const prompt = buildAnswerPrompt(question.question, context, questionDate, provider) + // BEAM uses mem0's answer-generation prompt verbatim so retrieval + + // judging numbers are apples-to-apples with their published BEAM + // results. The provider-specific prompt is intentionally bypassed. + let basePrompt: string + let prompt: string + if (benchmark.name.startsWith("beam")) { + const sessions = benchmark.getHaystackSessions(question.questionId) + const sessionDateMap = new Map() + for (const s of sessions) { + const date = s.metadata?.date + if (typeof date === "string") sessionDateMap.set(s.sessionId, date) + } + basePrompt = buildBeamAnswerPrompt(question.question, [], sessionDateMap) + prompt = buildBeamAnswerPrompt(question.question, context, sessionDateMap) + } else { + basePrompt = buildAnswerPrompt(question.question, [], questionDate, provider) + prompt = buildAnswerPrompt(question.question, context, questionDate, provider) + } const basePromptTokens = countTokens(basePrompt, modelConfig) const promptTokens = countTokens(prompt, modelConfig) diff --git a/src/prompts/beam.ts b/src/prompts/beam.ts index 625252a..b91f113 100644 --- a/src/prompts/beam.ts +++ b/src/prompts/beam.ts @@ -3,6 +3,157 @@ export interface BeamRubricJudgeResult { reason: string } +/** + * Number of memories to retrieve per BEAM search. Matches mem0's evaluation + * cutoff exactly (their --top-k-cutoffs default) so the answering model sees + * the same context budget. mem0 retrieves 200 and trims to 100; we retrieve + * 100 directly since we evaluate at a single cutoff. + */ +export const BEAM_SEARCH_TOP_K = 100 + +/** + * How many of the retrieved memories to expose to the answering model. + * Mirrors BEAM_SEARCH_TOP_K — every retrieved memory is shown. + */ +export const BEAM_ANSWER_TOP_K = 100 + +const MONTH_NAMES = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", +] + +const MONTH_TO_NUMBER: Record = Object.fromEntries( + MONTH_NAMES.map((name, i) => [name.toLowerCase(), String(i + 1).padStart(2, "0")]) +) + +/** + * Parse BEAM's per-batch time_anchor strings into ISO YYYY-MM-DD. BEAM stores + * these as "Month-DD-YYYY" (e.g. "March-01-2024"); some batches have a null + * anchor, in which case we return undefined and the memory line is rendered + * without a date prefix (matching mem0's "if no created_at" branch). + */ +export function parseBeamTimeAnchor(anchor: unknown): string | undefined { + if (typeof anchor !== "string") return undefined + const m = anchor.match(/^(\w+)-(\d{1,2})-(\d{4})$/) + if (!m) return undefined + const [, monthName, dayStr, year] = m + const month = MONTH_TO_NUMBER[monthName.toLowerCase()] + if (!month) return undefined + return `${year}-${month}-${dayStr.padStart(2, "0")}` +} + +/** + * Human-readable form of a BEAM ISO date, used as `formattedDate` on session + * metadata so the Supermemory provider includes a natural-language date prefix + * in the ingested content (mirrors the LocoMo / LongMemEval pattern). + */ +export function formatBeamDate(iso: string): string { + const [year, month, day] = iso.split("-") + const monthName = MONTH_NAMES[parseInt(month, 10) - 1] + if (!monthName) return iso + return `${monthName} ${parseInt(day, 10)}, ${year}` +} + +interface BeamMemoryLike { + memory?: string + content?: string + metadata?: { sessionId?: string } +} + +/** + * Ported verbatim from mem0's get_beam_answer_generation_prompt + * (mem0ai/memory-benchmarks/benchmarks/beam/prompts.py). Memories are sliced + * to BEAM_ANSWER_TOP_K, sorted chronologically (oldest first), numbered, and + * prefixed with their session date when available — exactly the format mem0's + * answering LLM sees on their published BEAM 1M / 10M numbers. + */ +export function buildBeamAnswerPrompt( + question: string, + memories: unknown[], + sessionDateMap: Map +): string { + const sliced = memories.slice(0, BEAM_ANSWER_TOP_K) as BeamMemoryLike[] + const memoriesText = formatBeamMemories(sliced, sessionDateMap) + return `You are an AI assistant with access to stored memories from prior conversations with a user. +Use these memories to answer the following question as accurately and completely as possible. + +IMPORTANT RULES: +1. Scan ALL provided memories before answering — do not stop after the first relevant one. +2. If multiple memories contain relevant information, combine and cross-reference them. +3. If the memories contain contradictory information, prefer the more recent one. +4. If the memories don't contain enough information to answer, say exactly: "I don't have enough information to answer this question." +5. For temporal questions: pay attention to dates and relative time references. +6. For ordering questions: present events in chronological order. +7. For preference questions: use the most recently stated preference. +8. Be specific and direct — include exact names, dates, numbers, and details from the memories. +9. Do NOT invent or assume information that isn't in the memories. + +QUESTION: ${question} + +RETRIEVED MEMORIES: +${memoriesText} + +ANSWER:` +} + +function formatBeamMemories( + memories: BeamMemoryLike[], + sessionDateMap: Map +): string { + if (memories.length === 0) return "(No memories available)" + + // Resolve text + date per memory. + const items = memories.map((m) => { + const text = + typeof m?.memory === "string" + ? m.memory + : typeof m?.content === "string" + ? m.content + : JSON.stringify(m) + const sessionId = + typeof m?.metadata?.sessionId === "string" ? m.metadata.sessionId : "" + const date = sessionId ? sessionDateMap.get(sessionId) : undefined + return { text, sessionId, date } + }) + + // mem0 sorts by created_at ascending (oldest first). When date is missing + // we fall back to sessionId order, which is itself chronological for BEAM + // since sessionIds encode batch + turn ordinals. + items.sort((a, b) => { + const aKey = a.date || "" + const bKey = b.date || "" + if (aKey !== bKey) return aKey.localeCompare(bKey) + return a.sessionId.localeCompare(b.sessionId, undefined, { numeric: true }) + }) + + return items + .map((item, i) => { + const prefix = item.date ? `[${item.date}] ` : "" + return `${i + 1}. ${prefix}${item.text}` + }) + .join("\n") +} + +/** + * Ported verbatim from mem0's BEAM benchmark setup + * (mem0ai/memory-benchmarks/benchmarks/beam/prompts.py) so our judging is + * apples-to-apples with their published BEAM 1M / 10M numbers. + */ +export const BEAM_JUDGE_SYSTEM_PROMPT = + "You are an expert evaluator assessing whether an AI assistant's response satisfies " + + "specific rubric criteria. You must be objective, fair, and consistent. " + + "Return ONLY valid JSON with the exact format requested." + function parseJsonResponse(response: string): Record { const trimmed = response.trim() @@ -21,69 +172,96 @@ function parseJsonResponse(response: string): Record { return JSON.parse(trimmed) } +/** + * Ported from mem0's get_beam_nugget_judge_prompt. Each rubric "nugget" is + * scored independently on a 3-point scale by the judge LLM. + */ export function buildBeamRubricJudgePrompt( question: string, - rubricItem: string, + nugget: string, llmResponse: string ): string { - return `You are an expert evaluator tasked with judging whether the LLM's response demonstrates compliance with the specified RUBRIC CRITERION. - -## EVALUATION INPUTS -- QUESTION (what the user asked): ${question} -- RUBRIC CRITERION (what to check): ${rubricItem} -- RESPONSE TO EVALUATE: ${llmResponse} - -## EVALUATION RUBRIC: -The rubric defines a specific requirement, constraint, or expected behavior that the LLM response should demonstrate. - -**IMPORTANT**: Pay careful attention to whether the rubric specifies: -- **Positive requirements** (things the response SHOULD include/do) -- **Negative constraints** (things the response SHOULD NOT include/do, often indicated by "no", "not", "avoid", "absent") - -## RESPONSIVENESS REQUIREMENT (anchored to the QUESTION) -A compliant response must be **on-topic with respect to the QUESTION** and attempt to answer it. -- If the response does not address the QUESTION, score **0.0** and stop. -- For negative constraints, both must hold: (a) the response is responsive to the QUESTION, and (b) the prohibited element is absent. - -## SEMANTIC TOLERANCE RULES: -Judge by meaning, not exact wording. -- Accept paraphrases and synonyms that preserve intent. -- Case/punctuation/whitespace differences must be ignored. -- Numbers/currencies/dates may appear in equivalent forms when numerically equivalent. -- If the rubric expects a number or duration, prefer normalized comparison over string matching. - -## STYLE NEUTRALITY: -Ignore tone, politeness, length, and flourish unless the rubric explicitly requires a format/structure. -- Do not penalize hedging, voice, or verbosity if content satisfies the rubric. -- Only evaluate format when the rubric explicitly mandates it. - -## SCORING SCALE: -- 1.0 (Complete Compliance): Fully complies with the rubric criterion. -- 0.5 (Partial Compliance): Partially complies. -- 0.0 (No Compliance): Fails to comply. - -## OUTPUT FORMAT: -Return your evaluation in JSON format with two fields: -{ - "score": [your score: 1.0, 0.5, or 0.0], - "reason": "[detailed explanation of whether the rubric criterion was satisfied and why]" + return `Evaluate whether the following LLM response demonstrates compliance with the specified RUBRIC CRITERION. + +QUESTION: +${question} + +LLM RESPONSE: +${llmResponse} + +RUBRIC CRITERION: +${nugget} + +SCORING GUIDELINES: + +First, determine whether the rubric criterion is a POSITIVE requirement (the response SHOULD include something) or a NEGATIVE constraint (the response SHOULD NOT include something). + +**For POSITIVE requirements** (response should contain, mention, or demonstrate something): +- **1.0 (Complete Compliance)**: The required element is present, accurate, and complete. The response fully and clearly satisfies the rubric criterion. +- **0.5 (Partial Compliance)**: The required element is partially present, has minor inaccuracies, or is incomplete. The core intent is present but not fully realized. +- **0.0 (No Compliance)**: The required element is missing, incorrect, or the response is entirely off-topic / non-responsive. + +**For NEGATIVE constraints** (response should NOT contain or should avoid something): +- **1.0 (Complete Compliance)**: The response is responsive to the question AND the prohibited element is absent. +- **0.5 (Partial Compliance)**: The response is responsive but contains a borderline or ambiguous reference to the prohibited element. +- **0.0 (No Compliance)**: The prohibited element is present in the response, OR the response is non-responsive (off-topic, refusal, empty). + +**Compound statement handling**: If the rubric criterion contains "and" or commas connecting multiple required elements: +- All elements present and correct = 1.0 +- Some (but not all) elements present and correct = 0.5 +- No elements present or correct = 0.0 + +EVALUATION RULES: +1. **Semantic tolerance**: Paraphrases and synonyms are acceptable. The response does not need to use the exact same words as the rubric. +2. **Numeric and date equivalence**: Treat equivalent representations as identical. "$68,000" = "68k" = "sixty-eight thousand dollars". "2 years" = "24 months". Prefer normalized comparison for numbers, currencies, dates, and durations. +3. **Case / punctuation / whitespace tolerance**: Differences in capitalization, punctuation, and whitespace must be ignored when comparing content. +4. **Hedging tolerance**: Do not penalize hedging language ("I think", "probably", "it seems"), passive voice, or verbosity if the substantive content satisfies the rubric criterion. +5. **Style neutrality**: Do not penalize for tone, formatting, or length unless the rubric criterion specifically requires a particular format. +6. **Responsiveness**: If the LLM response is completely off-topic or refuses to answer, score 0.0 for all criteria. +7. **Independence**: Evaluate this criterion in isolation — do not consider other rubric items. +8. **Specificity matters**: Vague or generic answers that could apply to any question score lower than specific, detailed answers. + +STEP-BY-STEP EVALUATION: +Follow these steps in order: +1. **Understand the Requirement**: Read the rubric criterion and classify it as a positive requirement or a negative constraint. +2. **Parse Compound Statements**: If the criterion contains multiple sub-requirements joined by "and" or commas, identify each element separately. +3. **Check Compliance**: Compare the LLM response against each element, applying the tolerance rules above (semantic, numeric, case, hedging). +4. **Assign Score**: Use the appropriate scoring table (positive or negative) and compound-statement rule to determine the score. +5. **Provide Reasoning**: Write a concise explanation referencing which elements were or were not satisfied. + +Return your evaluation as a JSON object with exactly two fields: +{"score": <0.0 or 0.5 or 1.0>, "reason": ""}` } -NOTE: ONLY output the JSON object, without any explanation before or after that` +/** + * Ported from mem0's _clamp_nugget_score. Snaps any numeric score the judge + * returns to the nearest of {0.0, 0.5, 1.0}, instead of penalizing on exact + * match failure. + */ +export function clampNuggetScore(raw: number): 0 | 0.5 | 1 { + if (!Number.isFinite(raw)) return 0 + if (raw >= 0.75) return 1 + if (raw >= 0.25) return 0.5 + return 0 } +/** + * Ported from mem0's judge_single_nugget response handling. If JSON parsing + * works, clamp the score; otherwise fall back to scanning the raw text for + * "1.0" / "0.5" markers. + */ export function parseBeamRubricJudgeResponse(response: string): BeamRubricJudgeResult { try { const parsed = parseJsonResponse(response) - const score = typeof parsed.score === "number" ? parsed.score : Number(parsed.score) + const raw = typeof parsed.score === "number" ? parsed.score : Number(parsed.score) return { - score: Number.isFinite(score) ? score : 0, + score: clampNuggetScore(raw), reason: typeof parsed.reason === "string" ? parsed.reason : "", } } catch { - return { - score: 0, - reason: "Failed to parse judge response", - } + const snippet = response.slice(0, 200) + if (snippet.includes("1.0")) return { score: 1, reason: snippet } + if (snippet.includes("0.5")) return { score: 0.5, reason: snippet } + return { score: 0, reason: `Parse error: ${snippet}` } } } From 7064491d353ea343a45bd1b2f42badfd11b52ddf Mon Sep 17 00:00:00 2001 From: Sreeram Sreedhar Date: Sun, 24 May 2026 01:15:50 -0400 Subject: [PATCH 4/4] add BEAM to CLI help and benchmark README --- README.md | 2 +- src/benchmarks/README.md | 15 +++++++++++++++ src/cli/index.ts | 7 +++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ef60b67..9b0ac51 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ GOOGLE_API_KEY= ``` -p, --provider Memory provider (supermemory, mem0, zep) --b, --benchmark Benchmark (locomo, longmemeval, convomem) +-b, --benchmark Benchmark (locomo, longmemeval, convomem, beam-1m, beam-10m) -j, --judge Judge model (gpt-4o, sonnet-4, gemini-2.5-flash, etc.) -r, --run-id Run identifier (auto-generated if omitted) -m, --answering-model Model for answer generation (default: gpt-4o) diff --git a/src/benchmarks/README.md b/src/benchmarks/README.md index 8d43cf9..e6b8dbc 100644 --- a/src/benchmarks/README.md +++ b/src/benchmarks/README.md @@ -36,6 +36,7 @@ interface Benchmark { | `locomo` | GitHub snap-research/locomo | Long context memory benchmark | | `longmemeval` | HuggingFace xiaowu0162/longmemeval-cleaned | Long-term memory evaluation | | `convomem` | HuggingFace Salesforce/ConvoMem | Conversational memory benchmark | +| `beam-1m` / `beam-10m` | HuggingFace Mohammadta/BEAM | Beyond a Million Tokens benchmark (1M and 10M token tiers) | ## Question Types @@ -67,3 +68,17 @@ interface Benchmark { | `changing_evidence` | change | Information updates | | `implicit_connection_evidence` | implicit | Implicit reasoning | | `abstention_evidence` | abstain | Unanswerable questions | + +### BEAM +| Type | Alias | Description | +|------|-------|-------------| +| `abstention` | abstain | Withhold answers when evidence is missing | +| `contradiction_resolution` | contradict | Detect and reconcile inconsistent statements | +| `event_ordering` | order | Reconstruct event or information order | +| `information_extraction` | extract | Recall entities and factual details | +| `instruction_following` | instruction | Follow sustained user instructions | +| `knowledge_update` | update | Retain updated facts over stale facts | +| `multi_session_reasoning` | multi | Reason across non-adjacent dialogue segments | +| `preference_following` | preference | Adapt to evolving user preferences | +| `summarization` | summary | Summarize dialogue content | +| `temporal_reasoning` | temporal | Reason about explicit and implicit time relations | diff --git a/src/cli/index.ts b/src/cli/index.ts index b3c29d6..e6ae3db 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -151,10 +151,17 @@ Available benchmark datasets for evaluation: Tests: user facts, assistant facts, preferences, implicit connections Source: HuggingFace Salesforce/ConvoMem (downloaded on first use) + beam BEAM - Beyond a Million Tokens benchmark + Tests: abstention, contradiction, event ordering, extraction, instructions, knowledge update, multi-session, preferences, summarization, temporal + Source: HuggingFace Mohammadta/BEAM (downloaded on first use) + Scales: beam-1m (700 q / 35 chats), beam-10m (200 q / 10 chats) + Usage: -b locomo Run LoCoMo benchmark -b longmemeval Run LongMemEval benchmark -b convomem Run ConvoMem benchmark + -b beam-1m Run BEAM 1M-token tier + -b beam-10m Run BEAM 10M-token tier `) }