1313
1414logger = logging .getLogger (__name__ )
1515
16- try :
17- from magic_pdf .pipe .UNIPipe import UNIPipe
18- from magic_pdf .rw .DiskReaderWriter import DiskReaderWriter
19- MINERU_AVAILABLE = True
20- except ImportError :
21- MINERU_AVAILABLE = False
16+ import shutil
17+ import subprocess
18+ import sys
19+
def _find_mineru() -> Optional[str]:
    """Locate the ``mineru`` command-line executable.

    Lookup order:

    1. ``PATH``, via :func:`shutil.which`.
    2. The script directory of the running interpreter's environment
       (``sys.prefix``). This covers shells that run the venv's interpreter
       without having activated the venv (e.g. a Claude Code shell), where
       the venv's script directory is not on ``PATH``.

    Returns:
        The path to the executable as a string, or ``None`` when it cannot
        be found in either place.
    """
    on_path = shutil.which("mineru")
    if on_path:
        return on_path

    # Console scripts live in ``bin`` on POSIX venvs, but in ``Scripts``
    # (usually as an ``.exe`` launcher) on Windows venvs.
    layouts = [("bin", "mineru")]
    if sys.platform == "win32":
        layouts = [("Scripts", "mineru.exe"), ("Scripts", "mineru")] + layouts

    prefix = Path(sys.prefix)
    for directory, filename in layouts:
        candidate = prefix / directory / filename
        if candidate.is_file():
            return str(candidate)
    return None
30+
# Resolve the MinerU CLI location once, at import time.
# ``_MINERU_PATH`` is the executable that ``load_and_split`` passes to
# ``subprocess.run``; ``MINERU_AVAILABLE`` is True only when it was found.
_MINERU_PATH = _find_mineru()
MINERU_AVAILABLE = _MINERU_PATH is not None
2233
2334
2435class PDFLoader :
@@ -250,30 +261,38 @@ def __init__(self):
250261 self .logger = logging .getLogger (f"quantcoder.{ self .__class__ .__name__ } " )
251262
252263 def load_and_split (self , pdf_path : str ) -> Dict [str , str ]:
253- """Read PDF, run MinerU pipeline , return sections dict.
264+ """Run MinerU CLI on *pdf_path* , return sections dict.
254265
255266 Returns:
256267 Dict mapping section headings to their text content,
257268 with LaTeX equations preserved as ``$...$`` / ``$$...$$``.
258269 """
259270 self .logger .info (f"Loading PDF via MinerU: { pdf_path } " )
260- try :
261- pdf_bytes = Path (pdf_path ).read_bytes ()
262- except FileNotFoundError :
271+ pdf_path = str (Path (pdf_path ).resolve ())
272+ if not Path (pdf_path ).exists ():
263273 self .logger .error (f"PDF file not found: { pdf_path } " )
264274 return {}
265- except Exception as e :
266- self .logger .error (f"Failed to read PDF: { e } " )
267- return {}
268275
269276 try :
270277 with tempfile .TemporaryDirectory () as tmp_dir :
271- image_writer = DiskReaderWriter (tmp_dir )
272- pipe = UNIPipe (pdf_bytes , {"_pdf_type" : "" , "model_list" : []}, image_writer )
273- pipe .pipe_classify ()
274- pipe .pipe_analyze ()
275- pipe .pipe_parse ()
276- md_content = pipe .pipe_mk_markdown (tmp_dir , drop_mode = "none" )
278+ result = subprocess .run (
279+ [_MINERU_PATH , "-p" , pdf_path , "-o" , tmp_dir , "-m" , "auto" ],
280+ capture_output = True , text = True , timeout = 600 ,
281+ )
282+ if result .returncode != 0 :
283+ self .logger .error (f"MinerU CLI failed: { result .stderr [:500 ]} " )
284+ return {}
285+
286+ # MinerU outputs to <tmp_dir>/<stem>/hybrid_auto/<stem>.md
287+ stem = Path (pdf_path ).stem
288+ md_candidates = list (Path (tmp_dir ).rglob (f"{ stem } .md" ))
289+ if not md_candidates :
290+ self .logger .error ("MinerU produced no markdown output" )
291+ return {}
292+ md_content = md_candidates [0 ].read_text (encoding = "utf-8" )
293+ except subprocess .TimeoutExpired :
294+ self .logger .error ("MinerU timed out after 600s" )
295+ return {}
277296 except Exception as e :
278297 self .logger .error (f"MinerU pipeline failed: { e } " )
279298 return {}
@@ -284,8 +303,9 @@ def load_and_split(self, pdf_path: str) -> Dict[str, str]:
284303
285304 @staticmethod
286305 def _parse_markdown_sections (md_content : str ) -> Dict [str , str ]:
287- """Split markdown on ``##`` headers into a sections dict.
306+ """Split markdown on heading lines into a sections dict.
288307
308+ Handles any heading level (``#``, ``##``, ``###``, etc.).
289309 Text before the first heading is stored under ``"Introduction"``.
290310 LaTeX equations (``$...$``, ``$$...$$``) are preserved verbatim.
291311
@@ -298,18 +318,20 @@ def _parse_markdown_sections(md_content: str) -> Dict[str, str]:
298318 if not md_content or not md_content .strip ():
299319 return {}
300320
321+ heading_re = re .compile (r'^(#{1,6})\s+(.+)$' )
322+
301323 sections : Dict [str , str ] = {}
302324 current_heading = "Introduction"
303325 current_lines : list [str ] = []
304326
305327 for line in md_content .splitlines ():
306- stripped = line .strip ()
307- if stripped . startswith ( "## " ) :
328+ m = heading_re . match ( line .strip () )
329+ if m :
308330 # Store previous section
309331 body = "\n " .join (current_lines ).strip ()
310332 if body :
311333 sections [current_heading ] = body
312- current_heading = stripped . lstrip ( "# " ).strip ()
334+ current_heading = m . group ( 2 ).strip ()
313335 current_lines = []
314336 else :
315337 current_lines .append (line )
0 commit comments