Skip to content

Commit a67f4a6

Browse files
SL-Mar and claude committed
Switch MinerU from dead UNIPipe API to CLI subprocess, fix section parser
- Replace magic_pdf.pipe.UNIPipe import with mineru CLI detection via _find_mineru(), which checks PATH and the venv bin directory
- Rewrite MinerULoader.load_and_split() to use subprocess.run() with the mineru binary instead of the removed Python API
- Fix _parse_markdown_sections() to handle any heading level (#, ##, ###) instead of only ## — MinerU outputs single-# headings
- Tested end-to-end: 23 sections extracted from article_5.pdf with full LaTeX equation preservation

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent eaca9d9 commit a67f4a6

1 file changed

Lines changed: 45 additions & 23 deletions

File tree

quantcoder/core/processor.py

Lines changed: 45 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -13,12 +13,23 @@
1313

1414
logger = logging.getLogger(__name__)
1515

16-
try:
17-
from magic_pdf.pipe.UNIPipe import UNIPipe
18-
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
19-
MINERU_AVAILABLE = True
20-
except ImportError:
21-
MINERU_AVAILABLE = False
16+
import shutil
17+
import subprocess
18+
import sys
19+
20+
def _find_mineru() -> Optional[str]:
    """Locate the ``mineru`` command-line executable.

    The search order is:

    1. ``PATH``, via :func:`shutil.which`.
    2. The script directory of the running interpreter's environment
       (``sys.prefix``). This covers shells where the virtualenv was never
       activated, so its script directory is missing from ``PATH``.

    Returns:
        The path to the executable as a string, or ``None`` when no
        candidate exists.
    """
    on_path = shutil.which("mineru")
    if on_path:
        return on_path

    # POSIX environments keep console scripts in ``bin``; Windows virtual
    # environments use ``Scripts`` and add an ``.exe`` suffix. The original
    # POSIX location is probed first so existing setups resolve unchanged.
    prefix = Path(sys.prefix)
    candidates = (
        prefix / "bin" / "mineru",
        prefix / "Scripts" / "mineru.exe",
        prefix / "Scripts" / "mineru",
    )
    for candidate in candidates:
        if candidate.is_file():
            return str(candidate)
    return None
30+
31+
_MINERU_PATH = _find_mineru()
32+
MINERU_AVAILABLE = _MINERU_PATH is not None
2233

2334

2435
class PDFLoader:
@@ -250,30 +261,38 @@ def __init__(self):
250261
self.logger = logging.getLogger(f"quantcoder.{self.__class__.__name__}")
251262

252263
def load_and_split(self, pdf_path: str) -> Dict[str, str]:
253-
"""Read PDF, run MinerU pipeline, return sections dict.
264+
"""Run MinerU CLI on *pdf_path*, return sections dict.
254265
255266
Returns:
256267
Dict mapping section headings to their text content,
257268
with LaTeX equations preserved as ``$...$`` / ``$$...$$``.
258269
"""
259270
self.logger.info(f"Loading PDF via MinerU: {pdf_path}")
260-
try:
261-
pdf_bytes = Path(pdf_path).read_bytes()
262-
except FileNotFoundError:
271+
pdf_path = str(Path(pdf_path).resolve())
272+
if not Path(pdf_path).exists():
263273
self.logger.error(f"PDF file not found: {pdf_path}")
264274
return {}
265-
except Exception as e:
266-
self.logger.error(f"Failed to read PDF: {e}")
267-
return {}
268275

269276
try:
270277
with tempfile.TemporaryDirectory() as tmp_dir:
271-
image_writer = DiskReaderWriter(tmp_dir)
272-
pipe = UNIPipe(pdf_bytes, {"_pdf_type": "", "model_list": []}, image_writer)
273-
pipe.pipe_classify()
274-
pipe.pipe_analyze()
275-
pipe.pipe_parse()
276-
md_content = pipe.pipe_mk_markdown(tmp_dir, drop_mode="none")
278+
result = subprocess.run(
279+
[_MINERU_PATH, "-p", pdf_path, "-o", tmp_dir, "-m", "auto"],
280+
capture_output=True, text=True, timeout=600,
281+
)
282+
if result.returncode != 0:
283+
self.logger.error(f"MinerU CLI failed: {result.stderr[:500]}")
284+
return {}
285+
286+
# MinerU outputs to <tmp_dir>/<stem>/hybrid_auto/<stem>.md
287+
stem = Path(pdf_path).stem
288+
md_candidates = list(Path(tmp_dir).rglob(f"{stem}.md"))
289+
if not md_candidates:
290+
self.logger.error("MinerU produced no markdown output")
291+
return {}
292+
md_content = md_candidates[0].read_text(encoding="utf-8")
293+
except subprocess.TimeoutExpired:
294+
self.logger.error("MinerU timed out after 600s")
295+
return {}
277296
except Exception as e:
278297
self.logger.error(f"MinerU pipeline failed: {e}")
279298
return {}
@@ -284,8 +303,9 @@ def load_and_split(self, pdf_path: str) -> Dict[str, str]:
284303

285304
@staticmethod
286305
def _parse_markdown_sections(md_content: str) -> Dict[str, str]:
287-
"""Split markdown on ``##`` headers into a sections dict.
306+
"""Split markdown on heading lines into a sections dict.
288307
308+
Handles any heading level (``#``, ``##``, ``###``, etc.).
289309
Text before the first heading is stored under ``"Introduction"``.
290310
LaTeX equations (``$...$``, ``$$...$$``) are preserved verbatim.
291311
@@ -298,18 +318,20 @@ def _parse_markdown_sections(md_content: str) -> Dict[str, str]:
298318
if not md_content or not md_content.strip():
299319
return {}
300320

321+
heading_re = re.compile(r'^(#{1,6})\s+(.+)$')
322+
301323
sections: Dict[str, str] = {}
302324
current_heading = "Introduction"
303325
current_lines: list[str] = []
304326

305327
for line in md_content.splitlines():
306-
stripped = line.strip()
307-
if stripped.startswith("## "):
328+
m = heading_re.match(line.strip())
329+
if m:
308330
# Store previous section
309331
body = "\n".join(current_lines).strip()
310332
if body:
311333
sections[current_heading] = body
312-
current_heading = stripped.lstrip("# ").strip()
334+
current_heading = m.group(2).strip()
313335
current_lines = []
314336
else:
315337
current_lines.append(line)

0 commit comments

Comments
 (0)