6 changes: 5 additions & 1 deletion Makefile
@@ -7,7 +7,7 @@ ENTERPRISE_USE_CASES_CORPUS ?= evals/datasets/enterprise_use_cases_corpus.jsonl
CAREER_EVAL_K_VALUES ?= 3,5,10
CAREER_EVAL_OUTPUT_DIR ?= evals/runs

.PHONY: help install test lint lint-ci eval eval-career baseline seed-golden build-job-ads-corpus build-enterprise-use-cases-corpus docs-openapi dev dev-stop dev-status dev-backend dev-frontend
.PHONY: help install test lint lint-ci eval eval-career eval-figure-chat baseline seed-golden build-job-ads-corpus build-enterprise-use-cases-corpus docs-openapi dev dev-stop dev-status dev-backend dev-frontend

help:
@echo "Available targets:"
@@ -17,6 +17,7 @@ help:
@echo " make lint-ci - Run Ruff lint checks (strict)"
@echo " make eval - Run full eval set (requires DATASET)"
@echo " make eval-career - Run offline career recommendation metrics from impression/feedback logs"
@echo " make eval-figure-chat - Verify figure-derived knowledge appears in chat answers"
@echo " make baseline - Run full eval and persist results under evals/runs/"
@echo " make seed-golden - Seed/update golden eval dataset"
@echo " make build-job-ads-corpus - Build canonical enterprise job ads corpus JSONL"
@@ -74,6 +75,9 @@ eval-career:
--k-values "$(CAREER_EVAL_K_VALUES)" \
--output-dir "$(CAREER_EVAL_OUTPUT_DIR)"

eval-figure-chat:
PYTHONPATH=. $(PYTHON) scripts/eval_figure_chat_answers.py

baseline:
@if [ ! -f "$(DATASET)" ]; then \
echo "Dataset not found: $(DATASET)"; \
32 changes: 31 additions & 1 deletion docs/architecture.md
@@ -1,6 +1,19 @@
# Architecture

Last updated: 2026-03-12
Last updated: 2026-03-14

## Two-Layer Architecture

The system derives two complementary layers from the same ingested corpus:

1. **Retrieval layer** — sources → chunks → embeddings (ChromaDB + FTS5) → hybrid retrieval → chat
2. **Knowledge layer** — sources → learnings → concepts → atoms → dossiers → curriculum

The retrieval layer serves recall (finding relevant passages).
The knowledge layer serves understanding (structured definitions, formulas, prerequisites, insights).

Both layers are read together at query time: the dossier provides the knowledge substrate,
retrieved chunks provide supporting evidence.

## System Overview

@@ -66,6 +79,23 @@ Content flows through three main paths:

**Ingest** — A URL (YouTube video, article, PDF) enters the pipeline. The system fetches the content, cleans it, chunks it, assesses relevance and quality, extracts learnings (skills, concepts, tools) via LLM, stores chunks in ChromaDB + FTS5, and saves structured learnings to SQLite. The pipeline runs on-demand from the UI or on a daily cron schedule.

For PDFs, figures and diagrams are detected and described before chunking:

```mermaid
flowchart LR
PDF --> OCR["Mistral OCR"]
OCR --> Text["Page text"]
OCR --> Imgs["Page images"]
Imgs --> Filter["Filter >2% page area"]
Filter --> Vision["Gemini 2.5 Flash"]
Vision --> Desc["Figure descriptions"]
Text --> Merge["Merged text"]
Desc --> Merge
Merge --> Chunk["Chunk + embed + store"]
```

This captures chart values, method names, and diagram relationships as retrievable text. If `GEMINI_API_KEY` is not set, figures are silently skipped and the OCR text still flows through.
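The figure-selection step in the flowchart above can be sketched as follows. This is a minimal illustration, assuming Mistral OCR reports per-image bounding boxes as corner coordinates; the field names and the exact comparison are assumptions, not the shipped implementation.

```python
# Illustrative figure filter: keep only images covering >2% of the page area,
# so icons, logos, and decorative rules are not sent to the vision model.
# Bounding-box field names are assumed, not taken from the real pipeline.

MIN_PAGE_AREA_FRACTION = 0.02

def significant_images(images, page_width, page_height):
    """Return the images whose area exceeds 2% of the page area."""
    page_area = page_width * page_height
    kept = []
    for img in images:
        width = img["bottom_right_x"] - img["top_left_x"]
        height = img["bottom_right_y"] - img["top_left_y"]
        if (width * height) / page_area > MIN_PAGE_AREA_FRACTION:
            kept.append(img)
    return kept
```

Only the images that survive this filter would be described by the vision model, which keeps the parallel description step cheap on figure-light PDFs.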

**Retrieve + Chat** — When a user asks a question, the router classifies intent into one of six buckets: `clear_meta`, `ambiguous`, `general`, `content`, `web_search`, or `skill`. Content questions pass through a **query rewriter** that resolves multi-turn follow-ups ("tell me more about it") into standalone queries ("Python usage by Nestlé data teams") using conversation history. The rewritten query drives both hybrid retrieval (vector + BM25 + Cohere rerank) and final answer generation — ensuring the LLM stays anchored to the specific topic, not the general source. Answers stream back via SSE with explicit provenance (`answer_origin`: `library_rag`, `web_rag`, `skill`, `policy`, or `general`).
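The query-rewriter input described above can be sketched as a single prompt that combines conversation history with the follow-up question. The prompt wording here is purely illustrative; the shipped prompt is not shown in this diff.

```python
# Hedged sketch of the rewriter's prompt construction: history plus the
# follow-up are combined into one request for a standalone query.
# The exact wording is an assumption, not the real prompt.

def build_rewrite_prompt(history: list[str], follow_up: str) -> str:
    """Build an LLM prompt that resolves a follow-up into a standalone query."""
    turns = "\n".join(f"- {turn}" for turn in history)
    return (
        "Rewrite the final question as a standalone search query, "
        "resolving pronouns using the conversation so far.\n\n"
        f"Conversation:\n{turns}\n\n"
        f"Final question: {follow_up}\n\n"
        "Standalone query:"
    )
```

The LLM's completion (e.g. resolving "tell me more about it" into a topic-specific query) then drives both hybrid retrieval and answer generation.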

**Learn** — When learnings are confirmed, the system canonicalizes concepts and triggers background atom extraction. An LLM extracts structured knowledge atoms (definitions, formulas with LaTeX, examples, prerequisite claims, key insights) from source text, grounds each atom by verifying its context appears verbatim in the source, and aggregates atoms across sources into a concept dossier. When a user requests an explanation, the dossier serves as the primary structured input ("knowledge substrate") while retrieved chunks provide supporting context. Teaching profiles (7 levels from primary school to advanced professional) and domain lenses (9 disciplines) adapt the explanation pedagogy and framing.
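The grounding step described above (verifying that an atom's context appears verbatim in the source) can be sketched as a normalized substring check. The whitespace/case normalization is an assumption; the real pipeline may ground atoms differently.

```python
# Minimal sketch of atom grounding: an extracted atom is kept only if its
# quoted supporting context occurs verbatim in the source text.
# Normalization (collapse whitespace, lowercase) is an assumption.
import re

def normalize(text: str) -> str:
    """Collapse whitespace and lowercase for a tolerant verbatim check."""
    return re.sub(r"\s+", " ", text).strip().lower()

def is_grounded(atom_context: str, source_text: str) -> bool:
    """True if the atom's supporting context appears in the source."""
    return normalize(atom_context) in normalize(source_text)
```

Atoms that fail this check would be dropped before aggregation into the concept dossier, which keeps hallucinated "evidence" out of the knowledge substrate.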
14 changes: 14 additions & 0 deletions docs/architecture/ingestion.md
@@ -14,6 +14,20 @@ Key models per step:
- **embed**: text-embedding-3-small (OpenAI)
- **contextualize**: claude-haiku-4-5 (chunk context headers)

### PDF Figure Descriptions

PDFs are processed with Mistral OCR (`include_image_base64=True`). Significant figures (>2% of page area) are described by Gemini 2.5 Flash in parallel. Each description is appended to the page text before chunking:

```
Figure (page 8): Recall Performance of Various Search Models

This bar chart compares six retrieval methods across Recall@1,
Recall@5, and Recall@20. Multimodal Hybrid Search achieves the
highest Recall@20 at approximately 0.95.
```

This makes chart values, method names, and diagram relationships retrievable as chunk content. Descriptions are generated at ingest time — no changes to embeddings, retrieval, or downstream extraction. If `GEMINI_API_KEY` is not set, figure descriptions are silently skipped.
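The append step can be sketched as a small merge function that turns each description into a `Figure (page N): ...` block after the page's OCR text. The field names are illustrative assumptions.

```python
# Hedged sketch of the merge step: figure descriptions are appended to the
# page's OCR text before chunking, so chart values become chunkable text.
# The `title`/`description` field names are assumptions.

def merge_figures_into_page(page_number: int, page_text: str,
                            figures: list[dict]) -> str:
    """Append one 'Figure (page N): ...' block per described figure."""
    blocks = [page_text]
    for fig in figures:
        blocks.append(
            f"Figure (page {page_number}): {fig['title']}\n\n{fig['description']}"
        )
    return "\n\n".join(blocks)
```

Because the descriptions are plain text by the time chunking runs, no changes to embeddings or retrieval are needed downstream.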

## Post-Confirmation: Atom Extraction

After a user confirms learnings (`confirm_learnings()`), two additional steps run:
6 changes: 6 additions & 0 deletions docs/critical-path.md
@@ -23,6 +23,10 @@ Source: `backend/services/ingest/pipeline.py`
```
1. fetch_youtube() [Supadata API] / fetch_article() [Firecrawl] / fetch_file()
→ backend/services/ingest/steps/fetch.py
For PDFs: extract_text_from_pdf() uses Mistral OCR with include_image_base64=True.
Significant figures (>2% page area) are described by Gemini 2.5 Flash in parallel.
Descriptions are appended as "Figure (page N): <title>\n\n<paragraph>" blocks.
→ file_ingest.py

2. save_transcript_raw() + download_youtube_thumbnail()
→ backend/services/ingest/steps/persist_assets.py
@@ -301,6 +305,8 @@ GET /api/career/recommend/jobs
| Chunk contextualization | claude-haiku-4-5 | Anthropic |
| Atom extraction (learning) | claude-haiku-4-5 | Anthropic |
| Extended thinking (projects) | claude-opus-4-6 | Anthropic |
| PDF OCR | mistral-ocr-latest | Mistral |
| PDF figure descriptions | gemini-2.5-flash | Google |

## Key Thresholds

4 changes: 3 additions & 1 deletion docs/engineering.md
@@ -201,7 +201,7 @@ Supported ingestion paths:
- `python file_ingest.py ...`
- `python dropbox_courses.py ...`

PDF handling in current runtime is driven through file/upload ingestion with OCR support.
PDF handling uses Mistral OCR with figure detection. Significant figures (>2% page area) are described by Gemini 2.5 Flash and inserted into the text stream before chunking, making chart values and diagram relationships retrievable.

### Ingestion invariants

@@ -255,6 +255,8 @@ We currently use approach 1 informally. The required sources are not yet scripted.

**Multi-turn test cases** (prefixed `mt_`) additionally require conversation state. The eval runner sends `prior_turns` before the final `user_input` to build context, then evaluates only the final response.

**Figure-dependent test cases** (prefixed `tc_fig_`, tagged `figure`) query content extracted from PDF figures (charts, diagrams). These require the source PDF to be ingested with figure descriptions enabled. A separate retrieval-only eval (`scripts/eval_figure_retrieval.py`) measures Hit@5 and MRR for figure chunks specifically.
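The two metrics named above can be sketched directly. Hit@5 is 1 if any relevant chunk appears in the top five results; MRR averages the reciprocal rank of the first relevant chunk per query, counting 0 when none is retrieved. This is a generic sketch, not the contents of `scripts/eval_figure_retrieval.py`.

```python
# Generic Hit@k and MRR over retrieval runs. A "run" is one query's ranked
# result IDs paired with its set of relevant chunk IDs.

def hit_at_k(retrieved_ids, relevant_ids, k=5):
    """1 if any relevant chunk appears in the top-k results, else 0."""
    return int(any(cid in relevant_ids for cid in retrieved_ids[:k]))

def mean_reciprocal_rank(runs):
    """Average reciprocal rank of the first relevant hit across runs."""
    total = 0.0
    for retrieved_ids, relevant_ids in runs:
        for rank, cid in enumerate(retrieved_ids, start=1):
            if cid in relevant_ids:
                total += 1.0 / rank
                break
    return total / len(runs)
```

Reporting these per figure-tagged test case isolates whether figure-derived chunks are retrievable at all, independent of answer quality.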

References:
- [Building a Golden Dataset for AI Evaluation (Maxim)](https://www.getmaxim.ai/articles/building-a-golden-dataset-for-ai-evaluation-a-step-by-step-guide/)
- [The Path to a Golden Dataset (Microsoft)](https://medium.com/data-science-at-microsoft/the-path-to-a-golden-dataset-or-how-to-evaluate-your-rag-045e23d1f13f)
5 changes: 4 additions & 1 deletion evals/datasets/golden_dataset.jsonl
@@ -1,4 +1,4 @@
{"_version": "2.0.0", "_count": 15}
{"_version": "2.1.0", "_count": 18}
{"id": "tc_001", "user_input": "What workflow patterns does Anthropic recommend for building AI agents?", "reference": "Anthropic identifies five key workflow patterns: prompt chaining (sequential LLM steps with gates), routing (classify input to specialized prompts), parallelization (sectioning or voting), orchestrator-workers (central LLM delegates to workers), and evaluator-optimizer (generate then evaluate in a loop). They recommend starting with the simplest solution and only adding complexity when needed.", "reference_contexts": ["Prompt chaining decomposes a task into a sequence of steps, where each LLM call processes the output of the previous one.", "Routing classifies an input and directs it to a specialized followup task. This workflow allows for separation of concerns, and building more specialized prompts.", "In the orchestrator-workers workflow, a central LLM dynamically breaks down tasks, delegates them to worker LLMs, and synthesizes their results.", "In the evaluator-optimizer workflow, one LLM call generates a response while another provides evaluation and feedback in a loop."], "reference_source_ids": ["article_ffc229d83891", "article_ffc229d83891", "article_ffc229d83891", "article_ffc229d83891"], "namespace": "All", "category": "explanatory", "difficulty": "easy", "created_at": "2026-03-06T11:00:00", "tags": ["curated"]}
{"id": "tc_002", "user_input": "When should you use agents versus simpler LLM solutions?", "reference": "Anthropic recommends finding the simplest solution possible and only increasing complexity when needed. Agentic systems trade latency and cost for better task performance. You should consider when this tradeoff is acceptable. Agents are best for tasks requiring complex reasoning, planning, tool use, and recovery from errors. For simpler tasks, workflows or even single LLM calls are preferable.", "reference_contexts": ["When building applications with LLMs, we recommend finding the simplest solution possible, and only increasing complexity when needed. This might mean not building agentic systems at all. Agentic systems often trade latency and cost for better task performance", "Agents are emerging in production as LLMs mature in key capabilities—understanding complex inputs, engaging in reasoning and planning, using tools reliably, and recovering from errors."], "reference_source_ids": ["article_ffc229d83891", "article_ffc229d83891"], "namespace": "All", "category": "explanatory", "difficulty": "easy", "created_at": "2026-03-06T11:00:00", "tags": ["curated"]}
{"id": "tc_003", "user_input": "What is prompt chaining and when is it useful?", "reference": "Prompt chaining decomposes a task into a sequence of steps where each LLM call processes the output of the previous one. You can add programmatic checks (gates) on intermediate steps to ensure the process stays on track. It is useful when a task can be cleanly decomposed into fixed subtasks, trading latency for higher accuracy by making each LLM call simpler and more focused.", "reference_contexts": ["Prompt chaining decomposes a task into a sequence of steps, where each LLM call processes the output of the previous one. You can add programmatic checks (see \"gate\" in the diagram below) on any intermediate steps to ensure that the process is still on track."], "reference_source_ids": ["article_ffc229d83891"], "namespace": "All", "category": "explanatory", "difficulty": "easy", "created_at": "2026-03-06T11:00:00", "tags": ["curated"]}
@@ -14,3 +14,6 @@
{"id": "mt_001", "user_input": "tell me more about the routing one", "prior_turns": ["What workflow patterns does Anthropic describe for AI agents?"], "reference": "Routing classifies an input and directs it to a specialized followup task. This allows separation of concerns and building more specialized prompts. Without routing, optimizing for one kind of input can hurt performance on other inputs. It is useful when there are distinct categories of input that need different handling.", "reference_contexts": ["Routing classifies an input and directs it to a specialized followup task. This workflow allows for separation of concerns, and building more specialized prompts. Without this workflow, optimizing for one kind of input can hurt performance on other inputs."], "reference_source_ids": ["article_ffc229d83891"], "namespace": "All", "category": "explanatory", "difficulty": "medium", "created_at": "2026-03-06T11:00:00", "tags": ["multi_turn", "curated"]}
{"id": "mt_002", "user_input": "how does that compare to using BM25 alone?", "prior_turns": ["What is Contextual Retrieval?"], "reference": "Contextual Retrieval with embeddings alone reduces failed retrievals by 35% compared to standard embeddings. Adding Contextual BM25 on top of contextual embeddings reduces failures by 49%. BM25 alone finds matches based on exact terms using TF-IDF, but misses semantic similarity. Combining contextual embeddings with contextual BM25 via rank fusion gives much better coverage than either alone.", "reference_contexts": ["This method can reduce the number of failed retrievals by 49% and, when combined with reranking", "BM25 works by building upon the TF-IDF (Term Frequency-Inverse Document Frequency) concept."], "reference_source_ids": ["article_33d811167435", "article_33d811167435"], "namespace": "All", "category": "explanatory", "difficulty": "medium", "created_at": "2026-03-06T11:00:00", "tags": ["multi_turn", "curated"]}
{"id": "mt_003", "user_input": "what about the cost?", "prior_turns": ["How does Contextual Retrieval work?", "Does reranking help?"], "reference": "Contextual Retrieval is cost-effective thanks to Claude's prompt caching - you don't need to pass in the reference document for every chunk. For reranking, there is a small latency and cost impact since it adds an extra step at runtime, but the reranker scores all chunks in a single pass. The overall cost-performance tradeoff is favorable given the significant reduction in retrieval failures.", "reference_contexts": ["With prompt caching, you don't need to pass in the reference document for every chunk.", "One important consideration with reranking is the impact on latency and cost, especially when reranking a large number of chunks. Because reranking adds an extra step at runtime, it inevitably adds a small amount of latency, even though the reranker scores all chunks in a single pass"], "reference_source_ids": ["article_33d811167435", "article_33d811167435"], "namespace": "All", "category": "explanatory", "difficulty": "hard", "created_at": "2026-03-06T11:00:00", "tags": ["multi_turn", "curated"]}
{"id": "tc_fig_001", "user_input": "What recall@20 does multimodal hybrid search achieve in the IRPAPERS benchmark?", "reference": "Multimodal Hybrid Search achieves approximately 0.95 Recall@20 in the IRPAPERS benchmark, outperforming all other methods including ColModernVBERT (0.93), Hybrid Text Search (0.91), and BM25 (0.90).", "reference_contexts": ["This bar chart illustrates the Recall performance of six different search models across three evaluation points: Recall @ 1, Recall @ 5, and Recall @ 20. The models evaluated are ColModernVBERT, ColModernVBERT with MUVERA, Arctic 2.0, BM25, Hybrid Text Search, and Multimodal Hybrid Search. At Recall @ 20, Multimodal Hybrid Search demonstrates the best performance with a recall of 0.95."], "reference_source_ids": ["doc_2b8c2a754395"], "namespace": "All", "category": "factoid", "difficulty": "medium", "created_at": "2026-03-14T20:00:00", "tags": ["figure", "curated"]}
{"id": "tc_fig_002", "user_input": "Which retrieval method performs best at Recall@1 in the IRPAPERS benchmark?", "reference": "Multimodal Hybrid Search performs best at Recall@1 with a score of 0.49, outperforming Hybrid Text Search (0.46), BM25 (0.45), Arctic 2.0 (0.44), ColModernVBERT (0.43), and ColModernVBERT with MUVERA (0.41).", "reference_contexts": ["At Recall @ 1, the Multimodal Hybrid Search achieves the highest recall of 0.49, outperforming ColModernVBERT (0.43) and ColModernVBERT with MUVERA (0.41)."], "reference_source_ids": ["doc_2b8c2a754395"], "namespace": "All", "category": "factoid", "difficulty": "medium", "created_at": "2026-03-14T20:00:00", "tags": ["figure", "curated"]}
{"id": "tc_fig_003", "user_input": "What components are included in the multimodal retrieval pipeline shown in the IRPAPERS paper?", "reference": "The IRPAPERS multimodal retrieval pipeline processes papers through three pathways: page images are converted to multi-vector image embeddings, OCR transcription feeds into BM25 for lexical retrieval, and OCR transcription is also used to create single-vector text embeddings. All three signals are fused in a final retrieval and QA analysis stage.", "reference_contexts": ["This figure illustrates a system for retrieval and question-answering (QA) analysis that leverages multiple data representations. The process begins with IRPAPERS, which are processed into both Page Image and OCR Transcription formats. The Page Image is then transformed into Multi-Vector Image Embeddings. Concurrently, the OCR Transcription feeds into two distinct text embedding methods: BM25 and Single-Vector Text Embeddings."], "reference_source_ids": ["doc_2b8c2a754395"], "namespace": "All", "category": "explanatory", "difficulty": "medium", "created_at": "2026-03-14T20:00:00", "tags": ["figure", "curated"]}