Skip to content

Commit 388c7fd

Browse files
SL-Mar and claude committed
Add two-pass LLM summarization pipeline for high-fidelity strategy extraction
Replace the keyword-filter → rigid-template single-pass path with a two-pass pipeline: Pass 1 extracts verbatim quotes from full paper sections, Pass 2 converts them into a flexible strategy spec. KeywordAnalyzer kept as automatic fallback if either LLM pass fails. 38 tests pass. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f45af16 commit 388c7fd

5 files changed

Lines changed: 417 additions & 19 deletions

File tree

quantcoder/core/llm.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,162 @@ def __init__(self, config):
7171
f"summary: {self._summary_llm.get_model_name()}"
7272
)
7373

74+
# -- Two-pass summarization helpers ----------------------------------
75+
76+
@staticmethod
def _format_sections_for_prompt(
    sections: Dict[str, str], max_chars: int = 60000
) -> str:
    """Format paper sections into a single string within a character budget.

    Sections are ordered by priority: names containing a methodology-like
    keyword come first, unclassified names second, and names that look like
    acknowledgments/references/appendix material last. Ties are broken
    alphabetically by section name. Each section is rendered as a
    ``### <name>`` header followed by its text.

    Once the budget is exhausted the current section is cut short and every
    later section is dropped, so the low-priority sections are the first to
    be lost.

    Sections whose text is empty or whitespace-only are skipped. They would
    otherwise contribute a bare header, spend budget on nothing, and make
    an all-empty paper look like non-empty prompt content to the caller.

    Args:
        sections: Mapping of section name to section text.
        max_chars: Maximum length, in characters, of the returned string.

    Returns:
        The concatenated sections. For a non-negative ``max_chars`` the
        result is never longer than ``max_chars``; it is ``""`` when no
        section has any text or the budget admits nothing.
    """
    HIGH_PRIORITY_KEYWORDS = {
        "method", "model", "strategy", "trading", "signal", "algorithm",
        "approach", "result", "experiment", "implementation", "data",
        "feature", "regression", "portfolio", "backtest", "return",
        "risk", "parameter", "calibration", "estimation", "framework",
    }
    LOW_PRIORITY_KEYWORDS = {
        "acknowledg", "reference", "bibliography", "appendix", "vita",
        "disclosure", "funding", "supplementar",
    }

    def _priority(name: str) -> int:
        # Substring match on the lower-cased name; the low-priority check
        # runs first, so e.g. "Data Appendix" is ranked low, not high.
        lower = name.lower()
        if any(kw in lower for kw in LOW_PRIORITY_KEYWORDS):
            return 2
        if any(kw in lower for kw in HIGH_PRIORITY_KEYWORDS):
            return 0
        return 1

    ordered = sorted(sections.items(), key=lambda kv: (_priority(kv[0]), kv[0]))

    parts: list[str] = []
    total = 0
    for name, text in ordered:
        # Nothing to quote from this section: do not emit a lone header.
        if not text or not text.strip():
            continue
        header = f"\n### {name}\n"
        available = max_chars - total - len(header)
        if available <= 0:
            # Not even one character of body fits after the header.
            break
        chunk = text[:available]
        parts.append(header + chunk)
        total += len(header) + len(chunk)

    return "".join(parts)
118+
119+
def extract_key_passages(self, sections: Dict[str, str]) -> Optional[str]:
    """Run Pass 1 of the two-pass pipeline: verbatim passage extraction.

    The sections are rendered into one prompt body with
    ``_format_sections_for_prompt`` and sent to the summary LLM together
    with instructions to quote, rather than paraphrase, every passage that
    matters for implementing the strategy.

    Args:
        sections: Mapping of section name to section text.

    Returns:
        Whatever ``_run_async`` yields for the chat call, or ``None`` when
        the formatted sections are blank or the LLM call raises.
    """
    self.logger.info("Two-pass pipeline — Pass 1 (extract key passages)")

    paper_text = self._format_sections_for_prompt(sections)
    if paper_text.strip() == "":
        self.logger.warning("No section text to send to LLM")
        return None

    # One bullet per topic the model is asked to look for.
    focus_items = (
        "Mathematical formulas and equations",
        "Parameter values and calibration details",
        "Entry and exit rules or signal generation logic",
        "Risk controls, stop-loss rules, position sizing",
        "Novel indicators or custom models (e.g., OU process, HMM, "
        "regime-switching, proprietary scores)",
        "Universe selection and data requirements",
        "Execution details (order types, rebalancing frequency)",
    )
    focus_block = "".join(f"- {item}\n" for item in focus_items)

    system_prompt = (
        "You are a quantitative finance research analyst. "
        "Read the paper sections below and QUOTE VERBATIM every passage that "
        "is relevant to implementing the described trading strategy or model. "
        "Do NOT paraphrase — copy the exact text. Do NOT skip passages because "
        "they use unfamiliar terms, novel formulas, or custom model names.\n\n"
        "Focus on:\n"
        f"{focus_block}"
        "\n"
        "Output format: a numbered list of verbatim quotes, each preceded by "
        "the section name in brackets. Example:\n"
        "[Methodology] \"We define the signal as ...\"\n"
    )
    user_prompt = f"Extract all implementable passages from this paper:\n{paper_text}"

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    try:
        passages = _run_async(
            self._summary_llm.chat(
                messages=conversation, max_tokens=4096, temperature=0.1
            )
        )
        # Kept inside the try: a non-sized result is reported as a failure.
        self.logger.info(
            f"Pass 1 complete — extracted {len(passages)} chars of passages"
        )
        return passages
    except Exception as e:
        self.logger.error(f"Pass 1 (extract_key_passages) failed: {e}")
        return None
167+
168+
def interpret_strategy(self, extractions: str) -> Optional[str]:
    """Run Pass 2 of the two-pass pipeline: quotes to strategy specification.

    The verbatim quotes produced by Pass 1 are sent to the summary LLM with
    a flexible section template and an instruction to stay faithful to what
    the quotes actually say.

    Args:
        extractions: Text of the Pass 1 output (verbatim quotes).

    Returns:
        Whatever ``_run_async`` yields for the chat call, or ``None`` when
        ``extractions`` is empty/blank or the LLM call raises.
    """
    self.logger.info("Two-pass pipeline — Pass 2 (interpret strategy)")

    if not (extractions and extractions.strip()):
        self.logger.warning("Empty extractions — nothing to interpret")
        return None

    # (heading, guidance) pairs for the free-text part of the template.
    spec_sections = (
        ("STRATEGY OVERVIEW", "One paragraph summarizing the core idea."),
        (
            "MATHEMATICAL MODEL",
            "Formulas, distributions, state dynamics — as described in the paper.",
        ),
        ("SIGNAL GENERATION", "Exact entry/exit conditions with numeric thresholds."),
        (
            "EXIT RULES",
            "Stop loss, profit target, time stop, trailing stop — with exact values.",
        ),
        ("RISK MANAGEMENT", "Position sizing, max exposure, drawdown limits."),
        ("UNIVERSE / STOCK SELECTION", "Market, filters, number of instruments."),
        ("EXECUTION DETAILS", "Order types, rebalancing frequency, data resolution."),
    )
    template = "".join(
        f"## {heading}\n{guidance}\n\n" for heading, guidance in spec_sections
    )

    system_prompt = (
        "You are a quantitative trading strategist. Convert the verbatim "
        "paper quotes below into a precise, implementable strategy "
        "specification. Base your output ONLY on what the quotes say. "
        "If the paper uses an OU process, specify an OU process — do NOT "
        "substitute RSI or SMA. If the paper describes a proprietary model "
        "or custom indicator, describe it faithfully.\n\n"
        "Use the following flexible structure. Skip any section that is "
        "genuinely irrelevant to this particular strategy:\n\n"
        f"{template}"
        "## PARAMETER TABLE\n"
        "| Parameter | Value | Source |\n"
        "|-----------|-------|--------|\n"
        "Every numeric parameter from the paper with source attribution.\n"
    )
    user_prompt = (
        "Convert these verbatim paper extractions into an implementable "
        f"strategy specification:\n\n{extractions}"
    )

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    try:
        spec = _run_async(
            self._summary_llm.chat(
                messages=conversation, max_tokens=4096, temperature=0.3
            )
        )
        # Kept inside the try: a non-sized result is reported as a failure.
        self.logger.info(
            f"Pass 2 complete — strategy spec is {len(spec)} chars"
        )
        return spec
    except Exception as e:
        self.logger.error(f"Pass 2 (interpret_strategy) failed: {e}")
        return None
227+
228+
# -- Legacy single-pass summary (kept intact) -------------------------
229+
74230
def generate_summary(self, extracted_data: Dict[str, List[str]]) -> Optional[str]:
75231
"""Generate a structured trading strategy summary for algorithm generation."""
76232
self.logger.info("Generating summary")

quantcoder/core/processor.py

Lines changed: 65 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -248,8 +248,33 @@ def __init__(self, config, max_refine_attempts: int = 6):
248248
self.llm_handler = LLMHandler(config)
249249
self.max_refine_attempts = max_refine_attempts
250250

251+
def extract_sections(self, pdf_path: str) -> Dict[str, str]:
    """Load a PDF and split its text into sections, without keyword filtering.

    The PDF text is loaded, preprocessed, scanned for headings and then
    handed to the section splitter.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        A dict built from the section splitter's output, or an empty dict
        when either loading or preprocessing yields nothing.
    """
    log = self.logger
    log.info(f"Extracting sections from PDF: {pdf_path}")

    raw = self.pdf_loader.load_pdf(pdf_path)
    if not raw:
        log.error("No text extracted from PDF")
        return {}

    cleaned = self.preprocessor.preprocess_text(raw)
    if not cleaned:
        log.error("Preprocessing failed")
        return {}

    detected = self.heading_detector.detect_headings(cleaned)
    if not detected:
        # Not fatal: the splitter is still called with the empty result.
        log.warning("No headings detected. Using default sectioning")

    return dict(self.section_splitter.split_into_sections(cleaned, detected))
275+
251276
def extract_structure(self, pdf_path: str) -> Dict[str, List[str]]:
252-
"""Extract structured data from PDF."""
277+
"""Extract structured data from PDF (legacy keyword-filtered path)."""
253278
self.logger.info(f"Starting extraction for PDF: {pdf_path}")
254279

255280
raw_text = self.pdf_loader.load_pdf(pdf_path)
@@ -272,20 +297,51 @@ def extract_structure(self, pdf_path: str) -> Dict[str, List[str]]:
272297
return keyword_analysis
273298

274299
def generate_summary(self, extracted_data: Dict[str, List[str]]) -> Optional[str]:
275-
"""Generate summary from extracted data."""
300+
"""Generate summary from extracted data (legacy single-pass)."""
276301
return self.llm_handler.generate_summary(extracted_data)
277302

278-
def extract_structure_and_generate_code(self, pdf_path: str) -> Dict:
279-
"""Extract structure and generate QuantConnect code."""
280-
self.logger.info("Starting extraction and code generation")
303+
def generate_two_pass_summary(self, pdf_path: str) -> Optional[str]:
    """Summarize a paper with the two-pass LLM pipeline.

    The stages run in order: section extraction, Pass 1 (verbatim quotes),
    Pass 2 (strategy specification). As soon as a stage yields a falsy
    result, a warning is logged and the result of ``_legacy_summarize`` is
    returned instead.

    Args:
        pdf_path: Path of the PDF to summarize.

    Returns:
        The Pass 2 strategy specification, or whatever the legacy path
        returns when any stage comes back empty.
    """

    def _fall_back(reason: str) -> Optional[str]:
        # Single exit for every failed stage: warn, then use the legacy path.
        self.logger.warning(f"{reason}, falling back to legacy path")
        return self._legacy_summarize(pdf_path)

    self.logger.info("Starting two-pass summarization pipeline")

    # Stage 1: full sections, no keyword filter.
    paper_sections = self.extract_sections(pdf_path)
    if not paper_sections:
        return _fall_back("No sections extracted")

    # Stage 2: Pass 1, verbatim quotes.
    quotes = self.llm_handler.extract_key_passages(paper_sections)
    if not quotes:
        return _fall_back("Pass 1 failed")

    # Stage 3: Pass 2, strategy specification.
    spec = self.llm_handler.interpret_strategy(quotes)
    if not spec:
        return _fall_back("Pass 2 failed")

    self.logger.info("Two-pass summarization complete")
    return spec
281330

331+
def _legacy_summarize(self, pdf_path: str) -> Optional[str]:
332+
"""Legacy single-pass summarization via KeywordAnalyzer + rigid template."""
333+
self.logger.info("Using legacy summarization path")
282334
extracted_data = self.extract_structure(pdf_path)
283335
if not extracted_data:
284-
self.logger.error("No data extracted for code generation")
285-
return {"summary": None, "code": None}
336+
return None
337+
return self.llm_handler.generate_summary(extracted_data)
338+
339+
def extract_structure_and_generate_code(self, pdf_path: str) -> Dict:
340+
"""Extract structure and generate QuantConnect code."""
341+
self.logger.info("Starting extraction and code generation")
286342

287-
# Generate summary
288-
summary = self.llm_handler.generate_summary(extracted_data)
343+
# Use two-pass pipeline (with automatic legacy fallback)
344+
summary = self.generate_two_pass_summary(pdf_path)
289345
if not summary:
290346
self.logger.error("Failed to generate summary")
291347
summary = "Summary could not be generated."

quantcoder/tools/article_tools.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -400,15 +400,8 @@ def execute(self, article_ids: List[int]) -> ToolResult:
400400
# Get article metadata
401401
article_meta = articles[article_id - 1]
402402

403-
# Process the article
404-
extracted_data = processor.extract_structure(str(filepath))
405-
406-
if not extracted_data:
407-
self.logger.warning(f"Failed to extract data from article {article_id}")
408-
continue
409-
410-
# Generate summary
411-
summary_text = processor.generate_summary(extracted_data)
403+
# Process the article (two-pass pipeline with legacy fallback)
404+
summary_text = processor.generate_two_pass_summary(str(filepath))
412405

413406
if not summary_text:
414407
self.logger.warning(f"Failed to generate summary for article {article_id}")

tests/test_llm.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,121 @@ def test_strip_markdown_no_fence(self):
106106
"""Test static _strip_markdown without fence."""
107107
text = "def test():\n pass"
108108
assert LLMHandler._strip_markdown(text) == text
109+
110+
111+
class TestFormatSectionsForPrompt:
    """Tests for LLMHandler._format_sections_for_prompt."""

    def test_under_budget(self):
        """All sections, headers and bodies, fit within the budget."""
        sections = {"Methodology": "Some methods.", "Results": "Some results."}
        result = LLMHandler._format_sections_for_prompt(sections, max_chars=10000)
        assert "Methodology" in result
        assert "Results" in result
        assert "Some methods." in result
        assert "Some results." in result

    def test_over_budget_truncates_low_priority(self):
        """The low-priority section absorbs the truncation when budget is tight."""
        budget = 550
        sections = {
            "Trading Strategy": "A" * 500,
            "References": "B" * 500,
        }
        result = LLMHandler._format_sections_for_prompt(sections, max_chars=budget)
        # The high-priority section is emitted first and kept whole.
        assert "Trading Strategy" in result
        assert "A" * 500 in result
        # References is sorted last, so it is the one that gets cut.
        assert "B" * 500 not in result
        # Headers count against the budget, so it is a hard upper bound.
        assert len(result) <= budget

    def test_high_priority_sections_first(self):
        """High-priority sections appear before medium-priority ones."""
        sections = {
            "Conclusion": "Wrap up.",
            "Model Calibration": "Key params here.",
        }
        result = LLMHandler._format_sections_for_prompt(sections, max_chars=10000)
        model_pos = result.index("Model Calibration")
        conclusion_pos = result.index("Conclusion")
        assert model_pos < conclusion_pos

    def test_empty_sections(self):
        """Empty dict returns empty string."""
        result = LLMHandler._format_sections_for_prompt({}, max_chars=10000)
        assert result == ""
148+
149+
150+
class TestExtractKeyPassages:
    """Tests for LLMHandler.extract_key_passages (Pass 1)."""

    def _make_handler(self, mock_config):
        """Build an LLMHandler whose LLM provider is replaced by a mock."""
        with patch("quantcoder.core.llm.LLMFactory") as mock_factory:
            mock_provider = MagicMock()
            mock_provider.get_model_name.return_value = "mistral"
            mock_provider.chat = AsyncMock(return_value="Test response")
            mock_factory.create.return_value = mock_provider
            handler = LLMHandler(mock_config)
        return handler

    def test_sends_full_sections(self, mock_config):
        """Verify full section text is sent, not keyword-filtered snippets."""
        handler = self._make_handler(mock_config)
        section_text = "OU process with mean-reversion parameter theta=0.5"
        sections = {"Methodology": section_text}
        llm_reply = "[Methodology] \"OU process...\""

        with patch("quantcoder.core.llm._run_async", return_value=llm_reply) as mock_run:
            result = handler.extract_key_passages(sections)

        assert result == llm_reply
        mock_run.assert_called_once()

        # Inspect what was actually handed to the LLM, not just that a call
        # happened: the user message must carry the unfiltered section text.
        messages = handler._summary_llm.chat.call_args.kwargs["messages"]
        assert messages[0]["role"] == "system"
        assert messages[1]["role"] == "user"
        assert "Methodology" in messages[1]["content"]
        assert section_text in messages[1]["content"]

    def test_returns_none_on_empty_sections(self, mock_config):
        """Empty sections dict returns None without calling the LLM."""
        handler = self._make_handler(mock_config)
        result = handler.extract_key_passages({})
        assert result is None
        handler._summary_llm.chat.assert_not_called()

    def test_returns_none_on_llm_failure(self, mock_config):
        """LLM exception returns None."""
        handler = self._make_handler(mock_config)
        sections = {"Intro": "Some text."}

        with patch("quantcoder.core.llm._run_async", side_effect=Exception("timeout")):
            result = handler.extract_key_passages(sections)

        assert result is None
190+
191+
192+
class TestInterpretStrategy:
    """Tests for LLMHandler.interpret_strategy (Pass 2)."""

    def _make_handler(self, mock_config):
        """Build an LLMHandler whose LLM provider is replaced by a mock."""
        with patch("quantcoder.core.llm.LLMFactory") as mock_factory:
            mock_provider = MagicMock()
            mock_provider.get_model_name.return_value = "mistral"
            mock_provider.chat = AsyncMock(return_value="Test response")
            mock_factory.create.return_value = mock_provider
            handler = LLMHandler(mock_config)
        return handler

    def test_passes_extractions_to_llm(self, mock_config):
        """Verify Pass 1 extractions are sent as input to Pass 2."""
        handler = self._make_handler(mock_config)
        extractions = '[Methodology] "theta = 0.5, half-life = 10 days"'
        llm_reply = "## STRATEGY OVERVIEW\nMean reversion"

        with patch("quantcoder.core.llm._run_async", return_value=llm_reply) as mock_run:
            result = handler.interpret_strategy(extractions)

        assert result == llm_reply
        assert "STRATEGY OVERVIEW" in result
        mock_run.assert_called_once()

        # The quotes from Pass 1 must appear verbatim in the user message.
        messages = handler._summary_llm.chat.call_args.kwargs["messages"]
        assert messages[0]["role"] == "system"
        assert messages[1]["role"] == "user"
        assert extractions in messages[1]["content"]

    def test_returns_none_on_empty_input(self, mock_config):
        """Empty or blank extractions return None without calling the LLM."""
        handler = self._make_handler(mock_config)
        assert handler.interpret_strategy("") is None
        assert handler.interpret_strategy(" ") is None
        handler._summary_llm.chat.assert_not_called()

    def test_returns_none_on_llm_failure(self, mock_config):
        """LLM exception returns None."""
        handler = self._make_handler(mock_config)
        with patch("quantcoder.core.llm._run_async", side_effect=Exception("timeout")):
            result = handler.interpret_strategy("some extractions")
        assert result is None

0 commit comments

Comments
 (0)