diff --git a/Makefile b/Makefile
index af2ad664..bc47f551 100644
--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,7 @@ ENTERPRISE_USE_CASES_CORPUS ?= evals/datasets/enterprise_use_cases_corpus.jsonl
CAREER_EVAL_K_VALUES ?= 3,5,10
CAREER_EVAL_OUTPUT_DIR ?= evals/runs
-.PHONY: help install test lint lint-ci eval eval-career baseline seed-golden build-job-ads-corpus build-enterprise-use-cases-corpus docs-openapi dev dev-stop dev-status dev-backend dev-frontend
+.PHONY: help install test lint lint-ci eval eval-career eval-figure-chat baseline seed-golden build-job-ads-corpus build-enterprise-use-cases-corpus docs-openapi dev dev-stop dev-status dev-backend dev-frontend
help:
@echo "Available targets:"
@@ -17,6 +17,7 @@ help:
@echo " make lint-ci - Run Ruff lint checks (strict)"
@echo " make eval - Run full eval set (requires DATASET)"
@echo " make eval-career - Run offline career recommendation metrics from impression/feedback logs"
+ @echo " make eval-figure-chat - Verify figure-derived knowledge appears in chat answers"
@echo " make baseline - Run full eval and persist results under evals/runs/"
@echo " make seed-golden - Seed/update golden eval dataset"
@echo " make build-job-ads-corpus - Build canonical enterprise job ads corpus JSONL"
@@ -74,6 +75,9 @@ eval-career:
--k-values "$(CAREER_EVAL_K_VALUES)" \
--output-dir "$(CAREER_EVAL_OUTPUT_DIR)"
+eval-figure-chat:
+ PYTHONPATH=. $(PYTHON) scripts/eval_figure_chat_answers.py
+
baseline:
@if [ ! -f "$(DATASET)" ]; then \
echo "Dataset not found: $(DATASET)"; \
diff --git a/docs/architecture.md b/docs/architecture.md
index 89470c3a..8e12e040 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -1,6 +1,19 @@
# Architecture
-Last updated: 2026-03-12
+Last updated: 2026-03-14
+
+## Two-Layer Architecture
+
+The system derives two complementary layers from the same ingested corpus:
+
+1. **Retrieval layer** — sources → chunks → embeddings (ChromaDB + FTS5) → hybrid retrieval → chat
+2. **Knowledge layer** — sources → learnings → concepts → atoms → dossiers → curriculum
+
+The retrieval layer serves recall (finding relevant passages).
+The knowledge layer serves understanding (structured definitions, formulas, prerequisites, insights).
+
+Both layers are read together at query time: the dossier provides the knowledge substrate,
+retrieved chunks provide supporting evidence.
## System Overview
@@ -66,6 +79,23 @@ Content flows through three main paths:
**Ingest** — A URL (YouTube video, article, PDF) enters the pipeline. The system fetches the content, cleans it, chunks it, assesses relevance and quality, extracts learnings (skills, concepts, tools) via LLM, stores chunks in ChromaDB + FTS5, and saves structured learnings to SQLite. The pipeline runs on-demand from the UI or on a daily cron schedule.
+For PDFs, figures and diagrams are detected and described before chunking:
+
+```mermaid
+flowchart LR
+ PDF --> OCR["Mistral OCR"]
+ OCR --> Text["Page text"]
+ OCR --> Imgs["Page images"]
+ Imgs --> Filter["Filter >2% page area"]
+ Filter --> Vision["Gemini 2.5 Flash"]
+ Vision --> Desc["Figure descriptions"]
+ Text --> Merge["Merged text"]
+ Desc --> Merge
+ Merge --> Chunk["Chunk + embed + store"]
+```
+
+This captures chart values, method names, and diagram relationships as retrievable text. If `GEMINI_API_KEY` is not set, figure descriptions are silently skipped and the OCR text still flows through unchanged.
+
**Retrieve + Chat** — When a user asks a question, the router classifies intent into one of six buckets: `clear_meta`, `ambiguous`, `general`, `content`, `web_search`, or `skill`. Content questions pass through a **query rewriter** that resolves multi-turn follow-ups ("tell me more about it") into standalone queries ("Python usage by Nestlé data teams") using conversation history. The rewritten query drives both hybrid retrieval (vector + BM25 + Cohere rerank) and final answer generation — ensuring the LLM stays anchored to the specific topic, not the general source. Answers stream back via SSE with explicit provenance (`answer_origin`: `library_rag`, `web_rag`, `skill`, `policy`, or `general`).
**Learn** — When learnings are confirmed, the system canonicalizes concepts and triggers background atom extraction. An LLM extracts structured knowledge atoms (definitions, formulas with LaTeX, examples, prerequisite claims, key insights) from source text, grounds each atom by verifying its context appears verbatim in the source, and aggregates atoms across sources into a concept dossier. When a user requests an explanation, the dossier serves as the primary structured input ("knowledge substrate") while retrieved chunks provide supporting context. Teaching profiles (7 levels from primary school to advanced professional) and domain lenses (9 disciplines) adapt the explanation pedagogy and framing.
diff --git a/docs/architecture/ingestion.md b/docs/architecture/ingestion.md
index bf21a3d2..5a21ce1b 100644
--- a/docs/architecture/ingestion.md
+++ b/docs/architecture/ingestion.md
@@ -14,6 +14,20 @@ Key models per step:
- **embed**: text-embedding-3-small (OpenAI)
- **contextualize**: claude-haiku-4-5 (chunk context headers)
+### PDF Figure Descriptions
+
+PDFs are processed with Mistral OCR (`include_image_base64=True`). Significant figures (>2% of page area) are described by Gemini 2.5 Flash in parallel. Each description is appended to the page text before chunking:
+
+```
+Figure (page 8): Recall Performance of Various Search Models
+
+This bar chart compares six retrieval methods across Recall@1,
+Recall@5, and Recall@20. Multimodal Hybrid Search achieves the
+highest Recall@20 at approximately 0.95.
+```
+
+This makes chart values, method names, and diagram relationships retrievable as chunk content. Descriptions are generated at ingest time — no changes to embeddings, retrieval, or downstream extraction. If `GEMINI_API_KEY` is not set, figure descriptions are silently skipped.
+
## Post-Confirmation: Atom Extraction
After a user confirms learnings (`confirm_learnings()`), two additional steps run:
diff --git a/docs/critical-path.md b/docs/critical-path.md
index 1d55fdc6..b1735620 100644
--- a/docs/critical-path.md
+++ b/docs/critical-path.md
@@ -23,6 +23,10 @@ Source: `backend/services/ingest/pipeline.py`
```
1. fetch_youtube() [Supadata API] / fetch_article() [Firecrawl] / fetch_file()
→ backend/services/ingest/steps/fetch.py
+ For PDFs: extract_text_from_pdf() uses Mistral OCR with include_image_base64=True.
+ Significant figures (>2% page area) are described by Gemini 2.5 Flash in parallel.
+   Descriptions are appended as "Figure (page N): <title>\n\n<description>" blocks.
+ → file_ingest.py
2. save_transcript_raw() + download_youtube_thumbnail()
→ backend/services/ingest/steps/persist_assets.py
@@ -301,6 +305,8 @@ GET /api/career/recommend/jobs
| Chunk contextualization | claude-haiku-4-5 | Anthropic |
| Atom extraction (learning) | claude-haiku-4-5 | Anthropic |
| Extended thinking (projects) | claude-opus-4-6 | Anthropic |
+| PDF OCR | mistral-ocr-latest | Mistral |
+| PDF figure descriptions | gemini-2.5-flash | Google |
## Key Thresholds
diff --git a/docs/engineering.md b/docs/engineering.md
index 8225e72a..08ecd343 100644
--- a/docs/engineering.md
+++ b/docs/engineering.md
@@ -201,7 +201,7 @@ Supported ingestion paths:
- `python file_ingest.py ...`
- `python dropbox_courses.py ...`
-PDF handling in current runtime is driven through file/upload ingestion with OCR support.
+PDF handling uses Mistral OCR with figure detection. Significant figures (>2% page area) are described by Gemini 2.5 Flash and inserted into the text stream before chunking, making chart values and diagram relationships retrievable.
### Ingestion invariants
@@ -255,6 +255,8 @@ We currently use approach 1 informally. The required sources are not yet scripte
**Multi-turn test cases** (prefixed `mt_`) additionally require conversation state. The eval runner sends `prior_turns` before the final `user_input` to build context, then evaluates only the final response.
+**Figure-dependent test cases** (prefixed `tc_fig_`, tagged `figure`) query content extracted from PDF figures (charts, diagrams). These require the source PDF to be ingested with figure descriptions enabled. A separate retrieval-only eval (`scripts/eval_figure_retrieval.py`) measures Hit@5 and MRR for figure chunks specifically.
+
References:
- [Building a Golden Dataset for AI Evaluation (Maxim)](https://www.getmaxim.ai/articles/building-a-golden-dataset-for-ai-evaluation-a-step-by-step-guide/)
- [The Path to a Golden Dataset (Microsoft)](https://medium.com/data-science-at-microsoft/the-path-to-a-golden-dataset-or-how-to-evaluate-your-rag-045e23d1f13f)
diff --git a/evals/datasets/golden_dataset.jsonl b/evals/datasets/golden_dataset.jsonl
index 7ed55571..95378c29 100644
--- a/evals/datasets/golden_dataset.jsonl
+++ b/evals/datasets/golden_dataset.jsonl
@@ -1,4 +1,4 @@
-{"_version": "2.0.0", "_count": 15}
+{"_version": "2.1.0", "_count": 18}
{"id": "tc_001", "user_input": "What workflow patterns does Anthropic recommend for building AI agents?", "reference": "Anthropic identifies five key workflow patterns: prompt chaining (sequential LLM steps with gates), routing (classify input to specialized prompts), parallelization (sectioning or voting), orchestrator-workers (central LLM delegates to workers), and evaluator-optimizer (generate then evaluate in a loop). They recommend starting with the simplest solution and only adding complexity when needed.", "reference_contexts": ["Prompt chaining decomposes a task into a sequence of steps, where each LLM call processes the output of the previous one.", "Routing classifies an input and directs it to a specialized followup task. This workflow allows for separation of concerns, and building more specialized prompts.", "In the orchestrator-workers workflow, a central LLM dynamically breaks down tasks, delegates them to worker LLMs, and synthesizes their results.", "In the evaluator-optimizer workflow, one LLM call generates a response while another provides evaluation and feedback in a loop."], "reference_source_ids": ["article_ffc229d83891", "article_ffc229d83891", "article_ffc229d83891", "article_ffc229d83891"], "namespace": "All", "category": "explanatory", "difficulty": "easy", "created_at": "2026-03-06T11:00:00", "tags": ["curated"]}
{"id": "tc_002", "user_input": "When should you use agents versus simpler LLM solutions?", "reference": "Anthropic recommends finding the simplest solution possible and only increasing complexity when needed. Agentic systems trade latency and cost for better task performance. You should consider when this tradeoff is acceptable. Agents are best for tasks requiring complex reasoning, planning, tool use, and recovery from errors. For simpler tasks, workflows or even single LLM calls are preferable.", "reference_contexts": ["When building applications with LLMs, we recommend finding the simplest solution possible, and only increasing complexity when needed. This might mean not building agentic systems at all. Agentic systems often trade latency and cost for better task performance", "Agents are emerging in production as LLMs mature in key capabilities—understanding complex inputs, engaging in reasoning and planning, using tools reliably, and recovering from errors."], "reference_source_ids": ["article_ffc229d83891", "article_ffc229d83891"], "namespace": "All", "category": "explanatory", "difficulty": "easy", "created_at": "2026-03-06T11:00:00", "tags": ["curated"]}
{"id": "tc_003", "user_input": "What is prompt chaining and when is it useful?", "reference": "Prompt chaining decomposes a task into a sequence of steps where each LLM call processes the output of the previous one. You can add programmatic checks (gates) on intermediate steps to ensure the process stays on track. It is useful when a task can be cleanly decomposed into fixed subtasks, trading latency for higher accuracy by making each LLM call simpler and more focused.", "reference_contexts": ["Prompt chaining decomposes a task into a sequence of steps, where each LLM call processes the output of the previous one. You can add programmatic checks (see \"gate\" in the diagram below) on any intermediate steps to ensure that the process is still on track."], "reference_source_ids": ["article_ffc229d83891"], "namespace": "All", "category": "explanatory", "difficulty": "easy", "created_at": "2026-03-06T11:00:00", "tags": ["curated"]}
@@ -14,3 +14,6 @@
{"id": "mt_001", "user_input": "tell me more about the routing one", "prior_turns": ["What workflow patterns does Anthropic describe for AI agents?"], "reference": "Routing classifies an input and directs it to a specialized followup task. This allows separation of concerns and building more specialized prompts. Without routing, optimizing for one kind of input can hurt performance on other inputs. It is useful when there are distinct categories of input that need different handling.", "reference_contexts": ["Routing classifies an input and directs it to a specialized followup task. This workflow allows for separation of concerns, and building more specialized prompts. Without this workflow, optimizing for one kind of input can hurt performance on other inputs."], "reference_source_ids": ["article_ffc229d83891"], "namespace": "All", "category": "explanatory", "difficulty": "medium", "created_at": "2026-03-06T11:00:00", "tags": ["multi_turn", "curated"]}
{"id": "mt_002", "user_input": "how does that compare to using BM25 alone?", "prior_turns": ["What is Contextual Retrieval?"], "reference": "Contextual Retrieval with embeddings alone reduces failed retrievals by 35% compared to standard embeddings. Adding Contextual BM25 on top of contextual embeddings reduces failures by 49%. BM25 alone finds matches based on exact terms using TF-IDF, but misses semantic similarity. Combining contextual embeddings with contextual BM25 via rank fusion gives much better coverage than either alone.", "reference_contexts": ["This method can reduce the number of failed retrievals by 49% and, when combined with reranking", "BM25 works by building upon the TF-IDF (Term Frequency-Inverse Document Frequency) concept."], "reference_source_ids": ["article_33d811167435", "article_33d811167435"], "namespace": "All", "category": "explanatory", "difficulty": "medium", "created_at": "2026-03-06T11:00:00", "tags": ["multi_turn", "curated"]}
{"id": "mt_003", "user_input": "what about the cost?", "prior_turns": ["How does Contextual Retrieval work?", "Does reranking help?"], "reference": "Contextual Retrieval is cost-effective thanks to Claude's prompt caching - you don't need to pass in the reference document for every chunk. For reranking, there is a small latency and cost impact since it adds an extra step at runtime, but the reranker scores all chunks in a single pass. The overall cost-performance tradeoff is favorable given the significant reduction in retrieval failures.", "reference_contexts": ["With prompt caching, you don't need to pass in the reference document for every chunk.", "One important consideration with reranking is the impact on latency and cost, especially when reranking a large number of chunks. Because reranking adds an extra step at runtime, it inevitably adds a small amount of latency, even though the reranker scores all chunks in a single pass"], "reference_source_ids": ["article_33d811167435", "article_33d811167435"], "namespace": "All", "category": "explanatory", "difficulty": "hard", "created_at": "2026-03-06T11:00:00", "tags": ["multi_turn", "curated"]}
+{"id": "tc_fig_001", "user_input": "What recall@20 does multimodal hybrid search achieve in the IRPAPERS benchmark?", "reference": "Multimodal Hybrid Search achieves approximately 0.95 Recall@20 in the IRPAPERS benchmark, outperforming all other methods including ColModernVBERT (0.93), Hybrid Text Search (0.91), and BM25 (0.90).", "reference_contexts": ["This bar chart illustrates the Recall performance of six different search models across three evaluation points: Recall @ 1, Recall @ 5, and Recall @ 20. The models evaluated are ColModernVBERT, ColModernVBERT with MUVERA, Arctic 2.0, BM25, Hybrid Text Search, and Multimodal Hybrid Search. At Recall @ 20, Multimodal Hybrid Search demonstrates the best performance with a recall of 0.95."], "reference_source_ids": ["doc_2b8c2a754395"], "namespace": "All", "category": "factoid", "difficulty": "medium", "created_at": "2026-03-14T20:00:00", "tags": ["figure", "curated"]}
+{"id": "tc_fig_002", "user_input": "Which retrieval method performs best at Recall@1 in the IRPAPERS benchmark?", "reference": "Multimodal Hybrid Search performs best at Recall@1 with a score of 0.49, outperforming Hybrid Text Search (0.46), BM25 (0.45), Arctic 2.0 (0.44), ColModernVBERT (0.43), and ColModernVBERT with MUVERA (0.41).", "reference_contexts": ["At Recall @ 1, the Multimodal Hybrid Search achieves the highest recall of 0.49, outperforming ColModernVBERT (0.43) and ColModernVBERT with MUVERA (0.41)."], "reference_source_ids": ["doc_2b8c2a754395"], "namespace": "All", "category": "factoid", "difficulty": "medium", "created_at": "2026-03-14T20:00:00", "tags": ["figure", "curated"]}
+{"id": "tc_fig_003", "user_input": "What components are included in the multimodal retrieval pipeline shown in the IRPAPERS paper?", "reference": "The IRPAPERS multimodal retrieval pipeline processes papers through three pathways: page images are converted to multi-vector image embeddings, OCR transcription feeds into BM25 for lexical retrieval, and OCR transcription is also used to create single-vector text embeddings. All three signals are fused in a final retrieval and QA analysis stage.", "reference_contexts": ["This figure illustrates a system for retrieval and question-answering (QA) analysis that leverages multiple data representations. The process begins with IRPAPERS, which are processed into both Page Image and OCR Transcription formats. The Page Image is then transformed into Multi-Vector Image Embeddings. Concurrently, the OCR Transcription feeds into two distinct text embedding methods: BM25 and Single-Vector Text Embeddings."], "reference_source_ids": ["doc_2b8c2a754395"], "namespace": "All", "category": "explanatory", "difficulty": "medium", "created_at": "2026-03-14T20:00:00", "tags": ["figure", "curated"]}
diff --git a/file_ingest.py b/file_ingest.py
index 6b068d54..bdc0dbbe 100644
--- a/file_ingest.py
+++ b/file_ingest.py
@@ -10,10 +10,13 @@
import argparse
import asyncio
+import base64
import hashlib
+import logging
import os
import re
import sqlite3
+from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from dotenv import load_dotenv
@@ -36,6 +39,19 @@
SUPPORTED_EXTENSIONS = {".pdf", ".md", ".markdown", ".txt", ".text"}
+logger = logging.getLogger(__name__)
+
+# --- PDF figure extraction constants ---
+_MIN_FIGURE_AREA_RATIO = 0.02 # Minimum 2% of page area to count as a figure
+_MAX_FIGURES_PER_PDF = 30 # Safety cap
+_FIGURE_VISION_TIMEOUT_S = 10 # Per-image timeout for vision calls
+_FIGURE_DESCRIPTION_PROMPT = (
+ "Describe this figure in a short technical paragraph (3-5 sentences). "
+ "Include method/model names, result values, metric names, axis labels, "
+ "and key comparisons if visible. Omit raw index numbers and vector values. "
+ "Start with a short title on the first line."
+)
+
def compute_file_hash(filepath: Path) -> str:
"""Compute SHA256 hash of file contents."""
@@ -87,8 +103,134 @@ def save_hash_store(namespace: str, hashes: dict[str, str]) -> None:
conn.close()
-def _pages_to_markdown_text(ocr_response: object) -> str:
- """Convert Mistral OCR response pages into plain markdown text."""
+def _describe_figure(
+ image_base64: str, page_num: int, image_id: str,
+) -> tuple[int, str, str | None]:
+ """Describe a single figure using Gemini Vision.
+
+ Returns (page_num, image_id, description) or
+ (page_num, image_id, None) on failure.
+ """
+ gemini_key = os.getenv("GEMINI_API_KEY", "").strip()
+ if not gemini_key:
+ return page_num, image_id, None
+
+ try:
+ from google import genai
+ from google.genai import types
+
+ b64_data = image_base64
+ if b64_data.startswith("data:"):
+ b64_data = b64_data.split(",", 1)[1]
+
+ image_bytes = base64.b64decode(b64_data)
+ mime = (
+ "image/jpeg"
+ if image_bytes[:2] == b"\xff\xd8"
+ else "image/png"
+ )
+
+ client = genai.Client(api_key=gemini_key)
+ response = client.models.generate_content(
+ model="gemini-2.5-flash",
+ contents=types.Content(
+ role="user",
+ parts=[
+ types.Part.from_bytes(
+ data=image_bytes, mime_type=mime,
+ ),
+ types.Part(text=_FIGURE_DESCRIPTION_PROMPT),
+ ],
+ ),
+ config=types.GenerateContentConfig(
+ max_output_tokens=1024,
+ temperature=0.1,
+ thinking_config=types.ThinkingConfig(
+ thinking_budget=0,
+ ),
+ ),
+ )
+ return page_num, image_id, response.text.strip()
+ except Exception as e:
+ logger.warning(
+ "Figure description failed for %s page %d: %s",
+ image_id, page_num, e,
+ )
+ return page_num, image_id, None
+
+
+def _extract_significant_images(pages: list) -> list[tuple[int, str, str]]:
+ """Extract significant images (figures, not icons) from OCR pages.
+
+ Returns list of (page_index, image_id, base64_data).
+ """
+ images = []
+ for page in pages:
+ page_index = getattr(page, "index", None)
+ if page_index is None and isinstance(page, dict):
+ page_index = page.get("index", 0)
+
+ dims = getattr(page, "dimensions", None)
+ if dims is None and isinstance(page, dict):
+ dims = page.get("dimensions")
+ page_w = getattr(dims, "width", 1) if dims else 1
+ page_h = getattr(dims, "height", 1) if dims else 1
+ page_area = page_w * page_h
+
+ page_images = getattr(page, "images", [])
+ if page_images is None:
+ page_images = []
+ if isinstance(page, dict):
+ page_images = page.get("images", [])
+
+ for img in page_images:
+ b64 = getattr(img, "image_base64", None)
+ if b64 is None and isinstance(img, dict):
+ b64 = img.get("image_base64")
+ if not b64:
+ continue
+
+ img_id = getattr(img, "id", "unknown")
+ if isinstance(img, dict):
+ img_id = img.get("id", "unknown")
+
+ # Compute area ratio for size filtering
+ def _coord(attr):
+ if isinstance(img, dict):
+ return img.get(attr)
+ return getattr(img, attr, None)
+
+ tlx = _coord("top_left_x")
+ brx = _coord("bottom_right_x")
+ tly = _coord("top_left_y")
+ bry = _coord("bottom_right_y")
+
+ has_coords = all(
+ c is not None for c in (tlx, brx, tly, bry)
+ )
+ if has_coords:
+ img_area = abs(brx - tlx) * abs(bry - tly)
+ area_ratio = (
+ img_area / page_area if page_area > 0 else 0
+ )
+ else:
+ area_ratio = 1.0 # Assume significant
+
+ if area_ratio >= _MIN_FIGURE_AREA_RATIO:
+ images.append((page_index, img_id, b64))
+
+ if len(images) >= _MAX_FIGURES_PER_PDF:
+ return images
+
+ return images
+
+
+def _pages_to_markdown_text(
+ ocr_response: object, describe_figures: bool = True,
+) -> str:
+ """Convert Mistral OCR response pages into markdown
+ text with figure descriptions.
+ """
pages = getattr(ocr_response, "pages", None)
if pages is None and isinstance(ocr_response, dict):
pages = ocr_response.get("pages")
@@ -96,6 +238,53 @@ def _pages_to_markdown_text(ocr_response: object) -> str:
if not pages:
return ""
+ figure_descriptions: dict[int, list[str]] = {}
+
+ if describe_figures:
+ significant_images = _extract_significant_images(pages)
+ if significant_images:
+ logger.info(
+ "Found %d significant figures, describing",
+ len(significant_images),
+ )
+ total_timeout = (
+ _FIGURE_VISION_TIMEOUT_S * len(significant_images)
+ )
+
+ with ThreadPoolExecutor(max_workers=4) as executor:
+ futures = {
+ executor.submit(
+ _describe_figure, b64, page_idx, img_id,
+ ): (page_idx, img_id)
+ for page_idx, img_id, b64 in significant_images
+ }
+ for future in as_completed(
+ futures, timeout=total_timeout,
+ ):
+ try:
+ page_idx, img_id, desc = future.result(
+ timeout=_FIGURE_VISION_TIMEOUT_S,
+ )
+ if desc:
+ figure_descriptions.setdefault(
+ page_idx, [],
+ ).append(desc)
+ except Exception as e:
+ page_idx, img_id = futures[future]
+ logger.warning(
+ "Figure description timed out "
+ "for %s: %s", img_id, e,
+ )
+
+ described = sum(
+ len(v) for v in figure_descriptions.values()
+ )
+ logger.info(
+ "Described %d/%d figures",
+ described, len(significant_images),
+ )
+
+ # Build text with figure descriptions appended per page
text_parts: list[str] = []
for page in pages:
markdown = getattr(page, "markdown", None)
@@ -104,6 +293,21 @@ def _pages_to_markdown_text(ocr_response: object) -> str:
if markdown and markdown.strip():
text_parts.append(markdown.strip())
+ page_index = getattr(page, "index", None)
+ if page_index is None and isinstance(page, dict):
+ page_index = page.get("index", 0)
+
+ for desc in figure_descriptions.get(page_index, []):
+ # Parse title from first line if model produced one
+ lines = desc.strip().split("\n", 1)
+ title_line = lines[0].strip().strip("*#").strip()
+ body = lines[1].strip() if len(lines) > 1 else title_line
+ if title_line and title_line != body:
+ header = f"Figure (page {page_index + 1}): {title_line}"
+ else:
+ header = f"Figure (page {page_index + 1})"
+ text_parts.append(f"{header}\n\n{body}")
+
return "\n\n".join(text_parts)
@@ -140,7 +344,7 @@ def extract_text_from_pdf(filepath: Path) -> str | None:
"type": "document_url",
"document_url": signed_url.url,
},
- include_image_base64=False,
+ include_image_base64=True,
)
text = _pages_to_markdown_text(ocr_response)
return text if text.strip() else None
diff --git a/scripts/eval_figure_chat_answers.py b/scripts/eval_figure_chat_answers.py
new file mode 100644
index 00000000..c7f0a8af
--- /dev/null
+++ b/scripts/eval_figure_chat_answers.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+"""Integration test: verify figure-derived knowledge appears in chat answers.
+
+Validates the full pipeline:
+ PDF -> OCR -> figure description -> chunk -> retrieval -> chat answer
+
+Requires IRPAPERS PDF ingested with figure descriptions enabled.
+
+Usage:
+ PYTHONPATH=. python scripts/eval_figure_chat_answers.py
+"""
+
+import asyncio
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+FIGURE_QUERIES = [
+ {
+ "query": (
+ "What recall@20 does multimodal hybrid search achieve "
+ "in the IRPAPERS benchmark?"
+ ),
+ "expected_terms": ["0.95", "95%", "95 %"],
+ },
+ {
+ "query": (
+ "Which retrieval method performs best at Recall@1?"
+ ),
+ "expected_terms": ["0.49", "49%", "49 %"],
+ },
+ {
+ "query": (
+ "What components are in the multimodal retrieval "
+ "pipeline described in the paper?"
+ ),
+ "expected_terms": [
+ "BM25", "Arctic", "ColModernVBERT",
+ "image embedding", "fusion",
+ ],
+ },
+]
+
+SANITY_QUERY = (
+ "What workflow patterns does Anthropic recommend "
+ "for building AI agents?"
+)
+
+FIGURE_MARKER = "Figure (page"
+
+
+def check_preconditions():
+ """Verify figure chunks exist in the corpus."""
+ import chromadb
+
+ from backend.config import get_settings
+
+ settings = get_settings()
+ client = chromadb.PersistentClient(path=settings.chroma_path)
+ collection = client.get_collection("global")
+
+ results = collection.get(include=["documents"])
+ all_docs = results["documents"]
+ figure_docs = [
+ d for d in all_docs if FIGURE_MARKER in d
+ ]
+
+ print("=== Figure chunk check ===")
+ print(f"Total chunks in global: {len(all_docs)}")
+ print(f"Figure chunks: {len(figure_docs)}")
+
+ if not figure_docs:
+ print(
+ "FAIL: No figure chunks found. "
+ "Ingest a PDF with figure descriptions first."
+ )
+ sys.exit(1)
+
+ preview = figure_docs[0]
+ start = preview.find(FIGURE_MARKER)
+ print(f"Example: {preview[start:start + 80]}...")
+ print()
+ return len(figure_docs)
+
+
+async def run_query(query: str) -> dict:
+ """Run a query through the full chat pipeline."""
+ from backend.services.chat import process_chat
+
+ resp = await process_chat(
+ message=query,
+ namespace="All",
+ use_query_expansion=True,
+ )
+
+ sources = []
+ for src in resp.sources[:5]:
+ text = src.full_text or src.preview or ""
+ sources.append(text)
+
+ return {
+ "answer": resp.answer,
+ "sources": sources,
+ }
+
+
+def find_figure_rank(sources: list[str]) -> int | None:
+ """Return 1-based rank of first source containing a figure."""
+ for i, text in enumerate(sources):
+ if FIGURE_MARKER in text:
+ return i + 1
+ return None
+
+
+def check_answer_terms(
+ answer: str, expected_terms: list[str],
+) -> bool:
+ """Check if the answer contains at least one expected term."""
+ lower = answer.lower()
+ return any(t.lower() in lower for t in expected_terms)
+
+
+async def main():
+ fig_count = check_preconditions()
+
+ print("=== Query tests ===")
+ all_passed = True
+
+ for i, case in enumerate(FIGURE_QUERIES, 1):
+ query = case["query"]
+ result = await run_query(query)
+
+ rank = find_figure_rank(result["sources"])
+ has_terms = check_answer_terms(
+ result["answer"], case["expected_terms"],
+ )
+
+ rank_ok = rank is not None and rank <= 5
+ passed = rank_ok and has_terms
+
+ print(f"Q{i}: {query[:60]}...")
+ print(f" Answer: {result['answer'][:200]}...")
+ print(f" Figure chunk rank: {rank or '>5'}")
+ print(f" Expected terms in answer: {has_terms}")
+ print(f" {'PASS' if passed else 'FAIL'}")
+ print()
+
+ if not passed:
+ all_passed = False
+
+ # Sanity check: non-figure query
+ print("=== Sanity check ===")
+ sanity = await run_query(SANITY_QUERY)
+ sanity_rank = find_figure_rank(sanity["sources"])
+ sanity_ok = sanity_rank is None
+
+ print(f"Non-figure query: {SANITY_QUERY[:50]}...")
+ print(
+ f" Figure chunk in sources: "
+ f"{'yes (unexpected)' if sanity_rank else 'no'}"
+ )
+ print(f" {'PASS' if sanity_ok else 'FAIL'}")
+ print()
+
+ if not sanity_ok:
+ all_passed = False
+
+ # Final verdict
+ status = "PASS" if all_passed else "FAIL"
+ print(f"MULTIMODAL CHAT TEST: {status}")
+
+ if not all_passed:
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ import logging
+
+ logging.basicConfig(level=logging.WARNING)
+ asyncio.run(main())
diff --git a/scripts/eval_figure_retrieval.py b/scripts/eval_figure_retrieval.py
new file mode 100644
index 00000000..1e8af69e
--- /dev/null
+++ b/scripts/eval_figure_retrieval.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+"""Evaluate whether figure-description chunks improve retrieval.
+
+Runs figure-dependent queries against the corpus and measures
+whether chunks containing 'Figure (page' appear in top-5 results.
+
+Metrics:
+ Hit@5 - fraction of queries where a figure chunk is in top 5
+ MRR - mean reciprocal rank of first figure chunk
+
+Usage:
+ PYTHONPATH=. python scripts/eval_figure_retrieval.py
+"""
+
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+from backend.services.retrieval.retriever import get_retriever
+
+QUERIES = [
+ {
+ "query": "What recall@20 does multimodal hybrid search achieve?",
+ "expects_figure": True,
+ },
+ {
+ "query": "Which retrieval method performs best at Recall@1?",
+ "expects_figure": True,
+ },
+ {
+ "query": (
+ "What methods are compared in the IRPAPERS benchmark chart?"
+ ),
+ "expects_figure": True,
+ },
+ {
+ "query": (
+ "What components are part of the multimodal retrieval pipeline?"
+ ),
+ "expects_figure": True,
+ },
+ {
+ "query": "Which models are evaluated in the IRPAPERS benchmark?",
+ "expects_figure": True,
+ },
+]
+
+K = 5 # Top-K for Hit@K
+
+
+def main():
+ retriever = get_retriever(namespace="global")
+
+ hits = 0
+ reciprocal_ranks = []
+
+ for case in QUERIES:
+ query = case["query"]
+ docs, timing = retriever.retrieve_with_timing(query)
+ scores = timing.get("scores", [0.0] * len(docs))
+
+ print(f"QUERY: {query}")
+ print("TOP RESULTS:")
+
+ figure_rank = None
+ for i, (doc, score) in enumerate(zip(docs[:K], scores[:K])):
+ content = doc.page_content[:130].replace("\n", " ")
+ is_fig = "Figure (page" in doc.page_content
+ marker = " << FIGURE" if is_fig else ""
+ print(f" {i+1}. ({score:.3f}) {content}...{marker}")
+ if is_fig and figure_rank is None:
+ figure_rank = i + 1
+
+ if figure_rank:
+ hits += 1
+ reciprocal_ranks.append(1.0 / figure_rank)
+ print(f"FIGURE RANK: {figure_rank}")
+ else:
+ reciprocal_ranks.append(0.0)
+ print("FIGURE RANK: not in top 5")
+ print()
+
+ n = len(QUERIES)
+ hit_rate = hits / n
+ mrr = sum(reciprocal_ranks) / n
+
+ print("=" * 50)
+ print(f"Hit@{K}: {hit_rate:.2f} ({hits}/{n} queries)")
+ print(f"MRR: {mrr:.3f}")
+ print("=" * 50)
+
+
+if __name__ == "__main__":
+ main()