XortexAI · ved015 · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
@@ -66,6 +66,24 @@ benchmarks/longmemeval/**/*.pyc
 benchmarks/longmemeval/data/
 benchmarks/longmemeval/results/
 benchmarks/longmemeval/outputs/
+!benchmarks/common/
+!benchmarks/common/**
+benchmarks/common/**/__pycache__/
+benchmarks/common/**/*.pyc
+!benchmarks/locomo/
+!benchmarks/locomo/**
+benchmarks/locomo/**/__pycache__/
+benchmarks/locomo/**/*.pyc
+benchmarks/locomo/data/
+benchmarks/locomo/results/
+benchmarks/locomo/outputs/
+!benchmarks/beam/
+!benchmarks/beam/**
+benchmarks/beam/**/__pycache__/
+benchmarks/beam/**/*.pyc
+benchmarks/beam/data/
+benchmarks/beam/results/
+benchmarks/beam/outputs/
 backboard/
 rust/
 

@@ -2,5 +2,6 @@
 
 ## Unreleased
 
+- Add modular LoCoMo and BEAM benchmark runners for the Python XMem API.
 - Add local XMem setup through `npx create-xmem@latest` and `npm run dev`.
 - Add local Docker storage, Chrome extension build patching, diagnostics, verification, and context export/import/sync commands.
@@ -3,7 +3,9 @@
 This directory contains benchmark harnesses for XMem.
 
 - `longmemeval/`: Python-only LongMemEval benchmark runner targeting the XMem HTTP API.
+- `locomo/`: Python-only LoCoMo benchmark runner targeting the XMem HTTP API.
+- `beam/`: Python-only BEAM benchmark runner targeting the XMem HTTP API, defaulting to the Hugging Face BEAM 1M split.
 
 Benchmark runs can create large dataset and result artifacts. Keep those files under
-`benchmarks/longmemeval/data`, `benchmarks/longmemeval/results`, or
-`benchmarks/longmemeval/outputs`; those paths are intentionally ignored by git.
+each benchmark's `data`, `results`, or `outputs` directory; those paths are
+intentionally ignored by git.
@@ -0,0 +1,86 @@
+# BEAM 1M Benchmark for XMem Python
+
+This harness benchmarks the Python XMem API on the BEAM dataset, defaulting to
+the `1M` split from `Mohammadta/BEAM` on Hugging Face. It does not run or
+compare the Go implementation.
+
+BEAM rows contain long `chat` histories and stringified `probing_questions`.
+The dataset card lists ten memory ability types: abstention, contradiction
+resolution, event ordering, information extraction, instruction following,
+knowledge update, multi-session reasoning, preference following, summarization,
+and temporal reasoning.
+
+## Dependencies
+
+BEAM is distributed as parquet. Install `pyarrow` before reading the downloaded
+dataset:
+
+```bash
+pip install pyarrow
+```
+
+## Smoke Check
+
+```bash
+python -m benchmarks.beam.run \
+  --split 1M \
+  --download \
+  --dry-run \
+  --limit 1
+```
+
+This downloads the BEAM 1M parquet file, validates that it parses, and counts
+ingest items without calling XMem.
+
+## Run Against XMem
+
+```bash
+export XMEM_API_KEY="..."
+
+python -m benchmarks.beam.run \
+  --split 1M \
+  --dataset-path benchmarks/beam/data/1M-00000-of-00001.parquet \
+  --api-base-url https://api.xmem.in \
+  --output-dir benchmarks/beam/results/beam-1m
+```
+
+To run a balanced slice, sample an equal percentage from each BEAM question
+type:
+
+```bash
+python -m benchmarks.beam.run \
+  --split 1M \
+  --dataset-path benchmarks/beam/data/1M-00000-of-00001.parquet \
+  --sample-percent-per-question-type 1 \
+  --api-base-url https://api.xmem.in \
+  --output-dir benchmarks/beam/results/beam-1m-1pct
+```
+
+Outputs:
+
+- `results.jsonl`: full per-question records with local proxy metrics
+- `predictions.jsonl`: `question_id` and `hypothesis`
+- `summary.json`: local exact/contains/token-F1 grouped by BEAM question type
+
+## Judge Evaluation
+
+The benchmark runner writes the BEAM rubric for each question into
+`results.jsonl`. To compute BEAM-style pass rate and average judge score with an
+OpenAI judge model:
+
+```bash
+export OPENAI_API_KEY="..."
+
+python -m benchmarks.beam.evaluate \
+  --results-path benchmarks/beam/results/beam-1m-1pct/results.jsonl \
+  --output-dir benchmarks/beam/results/beam-1m-1pct
+```
+
+This writes:
+
+- `evaluations.jsonl`: per-question rubric judge scores and reasons
+- `evaluation_summary.json`: pass rate and average judge score overall and by
+  question type
+
+The pass threshold is `judge_score >= 0.5`, matching the usual BEAM pass-rate
+interpretation over rubric scores.
@@ -0,0 +1 @@
+"""BEAM benchmark harness for the Python XMem API."""
@@ -0,0 +1,66 @@
+"""Configuration for the BEAM benchmark harness."""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+
+# Base URL of the XMem API used unless a config overrides it.
+DEFAULT_API_BASE_URL = "https://api.xmem.in"
+# Name of the environment variable the API key is read from by default.
+DEFAULT_API_KEY_ENV = "XMEM_API_KEY"
+# BEAM split used by default; it is one of the DEFAULT_DATASET_URLS keys.
+DEFAULT_SPLIT = "1M"
+# Parquet file URL for each BEAM split, keyed by split name, in the
+# Mohammadta/BEAM dataset on Hugging Face.
+DEFAULT_DATASET_URLS = {
+    "100K": (
+        "https://huggingface.co/datasets/Mohammadta/BEAM/resolve/main/"
+        "data/100K-00000-of-00001.parquet"
+    ),
+    "500K": (
+        "https://huggingface.co/datasets/Mohammadta/BEAM/resolve/main/"
+        "data/500K-00000-of-00001.parquet"
+    ),
+    "1M": (
+        "https://huggingface.co/datasets/Mohammadta/BEAM/resolve/main/"
+        "data/1M-00000-of-00001.parquet"
+    ),
+}
+
+
+@dataclass(frozen=True)
+class BenchmarkConfig:
+    """Immutable settings for one BEAM benchmark run.
+
+    Only ``dataset_path`` and ``output_dir`` are required; every other field
+    has a default. The API key is not stored on the instance: it is looked up
+    in the environment variable named by ``api_key_env`` each time ``api_key``
+    is read.
+
+    NOTE(review): the code that consumes these fields is not visible from this
+    module, so the group comments below reflect intent inferred from field
+    names and defaults -- confirm against the runner.
+    """
+
+    # Required input and output locations.
+    dataset_path: Path
+    output_dir: Path
+    # API connection, timeout and retry settings.
+    api_base_url: str = DEFAULT_API_BASE_URL
+    api_key_env: str = DEFAULT_API_KEY_ENV
+    api_timeout_seconds: float = 120.0
+    max_retries: int = 3
+    retry_backoff_seconds: float = 2.0
+    # Presumably ingest batching and polling controls -- TODO confirm.
+    batch_size: int = 25
+    ingest_api_version: str = "v2"
+    poll_interval_seconds: float = 2.0
+    poll_timeout_seconds: float = 1800.0
+    # Presumably retrieval/query options sent to XMem -- TODO confirm.
+    top_k: int = 10
+    effort_level: str = "low"
+    user_prefix: str = "beam"
+    # Presumably dataset slicing and per-question-type sampling -- TODO confirm.
+    limit: int | None = None
+    offset: int = 0
+    question_type: str | None = None
+    sample_percent_per_question_type: float | None = None
+    sample_min_per_question_type: int = 1
+    sample_seed: int = 13
+    split: str = DEFAULT_SPLIT
+    # Run-mode switches.
+    skip_ingest: bool = False
+    resume: bool = True
+    dry_run: bool = False
+
+    @property
+    def api_key(self) -> str:
+        """Return the API key read from the ``api_key_env`` variable.
+
+        Surrounding whitespace is stripped. An unset variable yields an empty
+        string rather than ``None``.
+        """
+        return os.getenv(self.api_key_env, "").strip()
+
+    def require_api_key(self) -> str:
+        """Return the API key, failing loudly when it is missing or blank.
+
+        Raises:
+            RuntimeError: If the ``api_key_env`` variable is unset, empty, or
+                whitespace-only.
+        """
+        api_key = self.api_key
+        if not api_key:
+            raise RuntimeError(
+                f"Missing API key. Set {self.api_key_env} before running BEAM."
+            )
+        return api_key
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""BEAM benchmark harness for the Python XMem API."""