Add two-pass LLM summarization pipeline for high-fidelity strategy extraction

SL-Mar · claude · SL-Mar · commit 388c7fd0df99 · 2026-02-10T10:25:41.000+01:00
Replace the keyword-filter → rigid-template single-pass path with a two-pass
pipeline: Pass 1 extracts verbatim quotes from full paper sections, Pass 2
converts them into a flexible strategy spec. KeywordAnalyzer kept as automatic
fallback if either LLM pass fails. 38 tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/quantcoder/core/llm.py b/quantcoder/core/llm.py
@@ -71,6 +71,162 @@ def __init__(self, config):
             f"summary: {self._summary_llm.get_model_name()}"
         )
 
+    # -- Two-pass summarization helpers ----------------------------------
+
+    @staticmethod
+    def _format_sections_for_prompt(
+        sections: Dict[str, str], max_chars: int = 60000
+    ) -> str:
+        """Join paper sections into one prompt string of at most ``max_chars``.
+
+        Sections are sorted by heading-keyword priority, then by name; the one
+        crossing the character (not token) budget is cut and the rest dropped.
+        """
+        HIGH_PRIORITY_KEYWORDS = {
+            "method", "model", "strategy", "trading", "signal", "algorithm",
+            "approach", "result", "experiment", "implementation", "data",
+            "feature", "regression", "portfolio", "backtest", "return",
+            "risk", "parameter", "calibration", "estimation", "framework",
+        }
+        LOW_PRIORITY_KEYWORDS = {
+            "acknowledg", "reference", "bibliography", "appendix", "vita",
+            "disclosure", "funding", "supplementar",
+        }
+
+        def _priority(name: str) -> int:  # 0 = high, 1 = neutral, 2 = low
+            lower = name.lower()
+            if any(kw in lower for kw in LOW_PRIORITY_KEYWORDS):  # low wins ties
+                return 2
+            if any(kw in lower for kw in HIGH_PRIORITY_KEYWORDS):
+                return 0
+            return 1
+
+        ordered = sorted(sections.items(), key=lambda kv: (_priority(kv[0]), kv[0]))
+
+        parts: list[str] = []
+        total = 0  # characters emitted so far, headers included
+        for name, text in ordered:
+            header = f"\n### {name}\n"
+            available = max_chars - total - len(header)  # room left for body text
+            if available <= 0:  # not even one body character fits
+                break  # every remaining (lower-priority) section is dropped
+            chunk = text[:available]
+            parts.append(header + chunk)
+            total += len(header) + len(chunk)
+
+        return "".join(parts)
+
+    def extract_key_passages(self, sections: Dict[str, str]) -> Optional[str]:
+        """Pass 1 — extractive: return verbatim LLM quotes; None if no text or on error."""
+        self.logger.info("Two-pass pipeline — Pass 1 (extract key passages)")
+
+        formatted = self._format_sections_for_prompt(sections)  # default budget
+        if not formatted.strip():
+            self.logger.warning("No section text to send to LLM")
+            return None
+
+        system = (
+            "You are a quantitative finance research analyst. "
+            "Read the paper sections below and QUOTE VERBATIM every passage that "
+            "is relevant to implementing the described trading strategy or model. "
+            "Do NOT paraphrase — copy the exact text. Do NOT skip passages because "
+            "they use unfamiliar terms, novel formulas, or custom model names.\n\n"
+            "Focus on:\n"
+            "- Mathematical formulas and equations\n"
+            "- Parameter values and calibration details\n"
+            "- Entry and exit rules or signal generation logic\n"
+            "- Risk controls, stop-loss rules, position sizing\n"
+            "- Novel indicators or custom models (e.g., OU process, HMM, "
+            "regime-switching, proprietary scores)\n"
+            "- Universe selection and data requirements\n"
+            "- Execution details (order types, rebalancing frequency)\n\n"
+            "Output format: a numbered list of verbatim quotes, each preceded by "
+            "the section name in brackets. Example:\n"
+            "[Methodology] \"We define the signal as ...\"\n"
+        )
+
+        prompt = f"Extract all implementable passages from this paper:\n{formatted}"
+
+        try:
+            messages = [
+                {"role": "system", "content": system},
+                {"role": "user", "content": prompt},
+            ]
+            result = _run_async(  # presumably runs the coroutine to completion — confirm
+                self._summary_llm.chat(
+                    messages=messages, max_tokens=4096, temperature=0.1
+                )
+            )
+            self.logger.info(
+                f"Pass 1 complete — extracted {len(result)} chars of passages"
+            )
+            return result
+        except Exception as e:  # any error here (incl. len() on a bad result) -> None
+            self.logger.error(f"Pass 1 (extract_key_passages) failed: {e}")
+            return None
+
+    def interpret_strategy(self, extractions: str) -> Optional[str]:
+        """Pass 2 — interpretive: turn Pass 1 quotes into a spec; None if empty or on error."""
+        self.logger.info("Two-pass pipeline — Pass 2 (interpret strategy)")
+
+        if not extractions or not extractions.strip():  # None, "" or whitespace only
+            self.logger.warning("Empty extractions — nothing to interpret")
+            return None
+
+        system = (
+            "You are a quantitative trading strategist. Convert the verbatim "
+            "paper quotes below into a precise, implementable strategy "
+            "specification. Base your output ONLY on what the quotes say. "
+            "If the paper uses an OU process, specify an OU process — do NOT "
+            "substitute RSI or SMA. If the paper describes a proprietary model "
+            "or custom indicator, describe it faithfully.\n\n"
+            "Use the following flexible structure. Skip any section that is "
+            "genuinely irrelevant to this particular strategy:\n\n"
+            "## STRATEGY OVERVIEW\n"
+            "One paragraph summarizing the core idea.\n\n"
+            "## MATHEMATICAL MODEL\n"
+            "Formulas, distributions, state dynamics — as described in the paper.\n\n"
+            "## SIGNAL GENERATION\n"
+            "Exact entry/exit conditions with numeric thresholds.\n\n"
+            "## EXIT RULES\n"
+            "Stop loss, profit target, time stop, trailing stop — with exact values.\n\n"
+            "## RISK MANAGEMENT\n"
+            "Position sizing, max exposure, drawdown limits.\n\n"
+            "## UNIVERSE / STOCK SELECTION\n"
+            "Market, filters, number of instruments.\n\n"
+            "## EXECUTION DETAILS\n"
+            "Order types, rebalancing frequency, data resolution.\n\n"
+            "## PARAMETER TABLE\n"
+            "| Parameter | Value | Source |\n"
+            "|-----------|-------|--------|\n"
+            "Every numeric parameter from the paper with source attribution.\n"
+        )
+
+        prompt = (
+            "Convert these verbatim paper extractions into an implementable "
+            f"strategy specification:\n\n{extractions}"
+        )
+
+        try:
+            messages = [
+                {"role": "system", "content": system},
+                {"role": "user", "content": prompt},
+            ]
+            result = _run_async(  # presumably runs the coroutine to completion — confirm
+                self._summary_llm.chat(
+                    messages=messages, max_tokens=4096, temperature=0.3
+                )
+            )
+            self.logger.info(
+                f"Pass 2 complete — strategy spec is {len(result)} chars"
+            )
+            return result
+        except Exception as e:  # any error here (incl. len() on a bad result) -> None
+            self.logger.error(f"Pass 2 (interpret_strategy) failed: {e}")
+            return None
+
+    # -- Legacy single-pass summary (kept intact) -------------------------
+
     def generate_summary(self, extracted_data: Dict[str, List[str]]) -> Optional[str]:
         """Generate a structured trading strategy summary for algorithm generation."""
         self.logger.info("Generating summary")
diff --git a/quantcoder/core/processor.py b/quantcoder/core/processor.py
@@ -248,8 +248,33 @@ def __init__(self, config, max_refine_attempts: int = 6):
         self.llm_handler = LLMHandler(config)
         self.max_refine_attempts = max_refine_attempts
 
+    def extract_sections(self, pdf_path: str) -> Dict[str, str]:
+        """Load, preprocess and split a PDF into sections (no keyword filtering).
+
+        Returns:
+            Dict of section name to full text; {} if loading or preprocessing fails.
+        """
+        self.logger.info(f"Extracting sections from PDF: {pdf_path}")
+
+        raw_text = self.pdf_loader.load_pdf(pdf_path)
+        if not raw_text:
+            self.logger.error("No text extracted from PDF")
+            return {}
+
+        preprocessed_text = self.preprocessor.preprocess_text(raw_text)
+        if not preprocessed_text:
+            self.logger.error("Preprocessing failed")
+            return {}
+
+        headings = self.heading_detector.detect_headings(preprocessed_text)
+        if not headings:  # not fatal: the splitter below still runs without headings
+            self.logger.warning("No headings detected. Using default sectioning")
+
+        sections = self.section_splitter.split_into_sections(preprocessed_text, headings)
+        return dict(sections)  # hand back a plain dict
+
     def extract_structure(self, pdf_path: str) -> Dict[str, List[str]]:
-        """Extract structured data from PDF."""
+        """Extract structured data from PDF (legacy keyword-filtered path)."""
         self.logger.info(f"Starting extraction for PDF: {pdf_path}")
 
         raw_text = self.pdf_loader.load_pdf(pdf_path)
@@ -272,20 +297,51 @@ def extract_structure(self, pdf_path: str) -> Dict[str, List[str]]:
         return keyword_analysis
 
     def generate_summary(self, extracted_data: Dict[str, List[str]]) -> Optional[str]:
-        """Generate summary from extracted data."""
+        """Generate summary from extracted data (legacy single-pass)."""
         return self.llm_handler.generate_summary(extracted_data)
 
-    def extract_structure_and_generate_code(self, pdf_path: str) -> Dict:
-        """Extract structure and generate QuantConnect code."""
-        self.logger.info("Starting extraction and code generation")
+    def generate_two_pass_summary(self, pdf_path: str) -> Optional[str]:
+        """Two-pass LLM summarization: extract sections, quote, then interpret.
+
+        Any empty step falls back to the legacy keyword path (re-reads the PDF).
+        """
+        self.logger.info("Starting two-pass summarization pipeline")
+
+        # Step 1 — get full sections (no keyword filter)
+        sections = self.extract_sections(pdf_path)
+        if not sections:
+            self.logger.warning("No sections extracted, falling back to legacy path")
+            return self._legacy_summarize(pdf_path)
+
+        # Step 2 — Pass 1: extract verbatim quotes
+        extractions = self.llm_handler.extract_key_passages(sections)
+        if not extractions:  # None (error) or empty string both trigger the fallback
+            self.logger.warning("Pass 1 failed, falling back to legacy path")
+            return self._legacy_summarize(pdf_path)
+
+        # Step 3 — Pass 2: interpret into strategy spec
+        summary = self.llm_handler.interpret_strategy(extractions)
+        if not summary:  # None (error) or empty string both trigger the fallback
+            self.logger.warning("Pass 2 failed, falling back to legacy path")
+            return self._legacy_summarize(pdf_path)
+
+        self.logger.info("Two-pass summarization complete")
+        return summary
 
+    def _legacy_summarize(self, pdf_path: str) -> Optional[str]:
+        """Legacy single-pass summarization via KeywordAnalyzer + rigid template."""
+        self.logger.info("Using legacy summarization path")
         extracted_data = self.extract_structure(pdf_path)
         if not extracted_data:
-            self.logger.error("No data extracted for code generation")
-            return {"summary": None, "code": None}
+            return None
+        return self.llm_handler.generate_summary(extracted_data)
+
+    def extract_structure_and_generate_code(self, pdf_path: str) -> Dict:
+        """Extract structure and generate QuantConnect code."""
+        self.logger.info("Starting extraction and code generation")
 
-        # Generate summary
-        summary = self.llm_handler.generate_summary(extracted_data)
+        # Use two-pass pipeline (with automatic legacy fallback)
+        summary = self.generate_two_pass_summary(pdf_path)
         if not summary:
             self.logger.error("Failed to generate summary")
             summary = "Summary could not be generated."
diff --git a/quantcoder/tools/article_tools.py b/quantcoder/tools/article_tools.py
@@ -400,15 +400,8 @@ def execute(self, article_ids: List[int]) -> ToolResult:
                 # Get article metadata
                 article_meta = articles[article_id - 1]
 
-                # Process the article
-                extracted_data = processor.extract_structure(str(filepath))
-
-                if not extracted_data:
-                    self.logger.warning(f"Failed to extract data from article {article_id}")
-                    continue
-
-                # Generate summary
-                summary_text = processor.generate_summary(extracted_data)
+                # Process the article (two-pass pipeline with legacy fallback)
+                summary_text = processor.generate_two_pass_summary(str(filepath))
 
                 if not summary_text:
                     self.logger.warning(f"Failed to generate summary for article {article_id}")
diff --git a/tests/test_llm.py b/tests/test_llm.py
@@ -106,3 +106,121 @@ def test_strip_markdown_no_fence(self):
         """Test static _strip_markdown without fence."""
         text = "def test():\n    pass"
         assert LLMHandler._strip_markdown(text) == text
+
+
+class TestFormatSectionsForPrompt:
+    """Tests for LLMHandler._format_sections_for_prompt."""
+
+    def test_under_budget(self):
+        """All sections fit within budget."""
+        sections = {"Methodology": "Some methods.", "Results": "Some results."}
+        result = LLMHandler._format_sections_for_prompt(sections, max_chars=10000)
+        assert "Methodology" in result
+        assert "Results" in result
+
+    def test_over_budget_truncates_low_priority(self):
+        """High-priority section survives a tight budget; total length stays bounded."""
+        sections = {
+            "Trading Strategy": "A" * 500,
+            "References": "B" * 500,
+        }
+        # Budget only allows ~550 chars total (header + content)
+        result = LLMHandler._format_sections_for_prompt(sections, max_chars=550)
+        assert "Trading Strategy" in result
+        # References gets only whatever room is left after the first section
+        assert len(result) <= 600  # loose bound; the formatter caps at max_chars
+
+    def test_high_priority_sections_first(self):
+        """High-priority sections appear before medium-priority ones."""
+        sections = {
+            "Conclusion": "Wrap up.",
+            "Model Calibration": "Key params here.",
+        }
+        result = LLMHandler._format_sections_for_prompt(sections, max_chars=10000)
+        model_pos = result.index("Model Calibration")
+        conclusion_pos = result.index("Conclusion")
+        assert model_pos < conclusion_pos  # alphabetical order alone would fail this
+
+    def test_empty_sections(self):
+        """Empty dict returns empty string."""
+        result = LLMHandler._format_sections_for_prompt({}, max_chars=10000)
+        assert result == ""
+
+
+class TestExtractKeyPassages:
+    """Tests for LLMHandler.extract_key_passages (Pass 1)."""
+
+    def _make_handler(self, mock_config):  # LLMHandler built against a mocked LLMFactory
+        with patch("quantcoder.core.llm.LLMFactory") as mock_factory:
+            mock_provider = MagicMock()
+            mock_provider.get_model_name.return_value = "mistral"
+            mock_provider.chat = AsyncMock(return_value="Test response")
+            mock_factory.create.return_value = mock_provider
+            handler = LLMHandler(mock_config)
+        return handler
+
+    def test_sends_full_sections(self, mock_config):
+        """Smoke test: Pass 1 returns the patched LLM output for a non-empty section."""
+        handler = self._make_handler(mock_config)
+        sections = {"Methodology": "OU process with mean-reversion parameter theta=0.5"}
+
+        with patch("quantcoder.core.llm._run_async", return_value="[Methodology] \"OU process...\"") as mock_run:
+            result = handler.extract_key_passages(sections)
+
+        assert result is not None
+        # Only checks that _run_async was invoked; prompt contents are NOT inspected
+        call_args = mock_run.call_args
+        assert call_args is not None
+
+    def test_returns_none_on_empty_sections(self, mock_config):
+        """Empty sections dict returns None."""
+        handler = self._make_handler(mock_config)
+        result = handler.extract_key_passages({})
+        assert result is None
+
+    def test_returns_none_on_llm_failure(self, mock_config):
+        """LLM exception returns None."""
+        handler = self._make_handler(mock_config)
+        sections = {"Intro": "Some text."}
+
+        with patch("quantcoder.core.llm._run_async", side_effect=Exception("timeout")):
+            result = handler.extract_key_passages(sections)
+
+        assert result is None
+
+
+class TestInterpretStrategy:
+    """Tests for LLMHandler.interpret_strategy (Pass 2)."""
+
+    def _make_handler(self, mock_config):  # LLMHandler built against a mocked LLMFactory
+        with patch("quantcoder.core.llm.LLMFactory") as mock_factory:
+            mock_provider = MagicMock()
+            mock_provider.get_model_name.return_value = "mistral"
+            mock_provider.chat = AsyncMock(return_value="Test response")
+            mock_factory.create.return_value = mock_provider
+            handler = LLMHandler(mock_config)
+        return handler
+
+    def test_passes_extractions_to_llm(self, mock_config):
+        """Pass 2 returns the patched LLM output; the prompt itself is not inspected."""
+        handler = self._make_handler(mock_config)
+        extractions = '[Methodology] "theta = 0.5, half-life = 10 days"'
+
+        with patch("quantcoder.core.llm._run_async", return_value="## STRATEGY OVERVIEW\nMean reversion") as mock_run:
+            result = handler.interpret_strategy(extractions)
+
+        assert result is not None
+        assert "STRATEGY OVERVIEW" in result
+
+    def test_returns_none_on_empty_input(self, mock_config):
+        """Empty extractions returns None."""
+        handler = self._make_handler(mock_config)
+        assert handler.interpret_strategy("") is None
+        assert handler.interpret_strategy("   ") is None
+
+    def test_returns_none_on_llm_failure(self, mock_config):
+        """LLM exception returns None."""
+        handler = self._make_handler(mock_config)
+        with patch("quantcoder.core.llm._run_async", side_effect=Exception("timeout")):
+            result = handler.interpret_strategy("some extractions")
+        assert result is None
diff --git a/tests/test_processor.py b/tests/test_processor.py