44from pathlib import Path
55
66
7+ def _tag_has_exact_class (tag , * target_classes : str ) -> bool :
8+ """Check whether a BS4 tag has any of the given class names."""
9+ classes = tag .get ("class" , [])
10+ if isinstance (classes , str ):
11+ classes = classes .split ()
12+ return any (cls in target_classes for cls in classes )
13+
14+
def is_url(s: str) -> bool:
    """Check if a string looks like an HTTP(S) URL."""
    # str.startswith accepts a tuple of prefixes — one call, no `or` chain.
    return s.startswith(("http://", "https://"))
@@ -38,14 +46,18 @@ def parse_document(file_path: str | Path) -> tuple[str, str]:
3846
3947
def _parse_pdf(path: Path) -> tuple[str, str]:
    """Extract text from PDF.

    Parser chain:
    1. Marker — best math + table quality, requires heavy ML deps
    2. pymupdf4llm — correct reading order and tables via GNN layout (default)
    """
    try:
        return _parse_pdf_marker(path)
    except (ImportError, FileNotFoundError, RuntimeError) as exc:
        # Marker is optional tooling; fall through to the default parser.
        print(f" Marker not available ({exc}), trying pymupdf4llm...")
        return _parse_pdf_pymupdf4llm(path)
4961
5062
5163def _parse_pdf_marker (path : Path ) -> tuple [str , str ]:
@@ -102,76 +114,62 @@ def _parse_pdf_marker(path: Path) -> tuple[str, str]:
102114 return title , markdown
103115
104116
def _parse_pdf_pymupdf4llm(path: Path) -> tuple[str, str]:
    """PDF extraction using pymupdf4llm with GNN layout analysis.

    Fixes hyphenation, reading order, and table structure vs raw pymupdf.
    pymupdf4llm is required; pymupdf-layout is optional — when installed it
    activates automatically and enables GNN-based table detection, otherwise
    extraction proceeds without it.

    Returns:
        (title, markdown) — title is extracted from the first heading.
    """
    try:
        import pymupdf.layout  # noqa: F401 — import activates the layout plugin
    except ImportError:
        pass  # layout plugin missing; pymupdf4llm still works, just without GNN

    import pymupdf4llm

    markdown = pymupdf4llm.to_markdown(str(path))
    markdown = _clean_pymupdf4llm_markdown(markdown)
    title = _extract_title_from_markdown(markdown)
    return title, markdown
122- doc = pymupdf .open (str (path ))
123- pages = []
124- title = ""
125136
126- for page_num , page in enumerate (doc ):
127- text = page .get_text ()
128- pages .append (text )
129-
130- if page_num == 0 and not title :
131- blocks = page .get_text ("dict" )["blocks" ]
132- best_size = 0
133- for block in blocks :
134- if "lines" not in block :
135- continue
136- for line in block ["lines" ]:
137- for span in line ["spans" ]:
138- if span ["text" ].strip () and span ["size" ] > best_size :
139- best_size = span ["size" ]
140-
141- if best_size > 0 :
142- candidates = []
143- current_parts = []
144- for block in blocks :
145- if "lines" not in block :
146- if current_parts :
147- candidates .append (" " .join (current_parts ))
148- current_parts = []
149- continue
150- for line in block ["lines" ]:
151- for span in line ["spans" ]:
152- span_text = span ["text" ].strip ()
153- if not span_text :
154- continue
155- if abs (span ["size" ] - best_size ) < 0.5 :
156- current_parts .append (span_text )
157- elif current_parts :
158- candidates .append (" " .join (current_parts ))
159- current_parts = []
160- if current_parts :
161- candidates .append (" " .join (current_parts ))
162- if candidates :
163- title = max (candidates , key = len )
164-
165- doc .close ()
166- full_text = "\n \n " .join (pages )
137+ def _clean_pymupdf4llm_markdown (md : str ) -> str :
138+ """Post-process pymupdf4llm markdown for cleaner LLM input.
167139
168- if not title :
169- for line in full_text .split ("\n " ):
170- if line .strip ():
171- title = line .strip ()[:200 ]
172- break
140+ - Strips the noisy '==> picture [WxH] intentionally omitted <==' lines.
141+ The pixel dimensions are meaningless and the phrasing distracts the LLM.
142+ Embedded figure text (chart labels, diagram text) and captions are kept.
143+ - Converts inline <br> separators in embedded text to newlines.
144+ """
145+ out = []
146+ for line in md .split ("\n " ):
147+ # Drop picture placeholder lines (with or without bold **)
148+ if "intentionally omitted" in line :
149+ stripped = line .strip ().strip ("*" ).strip ()
150+ if stripped .startswith ("==>" ):
151+ continue
152+ # Clean up <br> in embedded figure text lines
153+ if "<br>" in line :
154+ line = line .replace ("<br>" , "\n " )
155+ out .append (line )
156+ return "\n " .join (out )
173157
174- return title , full_text
158+
159+ def _extract_title_from_markdown (markdown : str ) -> str :
160+ """Extract the first heading from markdown text as the title."""
161+ fallback = ""
162+ for line in markdown .split ("\n " ):
163+ stripped = line .strip ()
164+ if not stripped :
165+ continue
166+ if stripped .startswith ("#" ):
167+ title = stripped .lstrip ("# " ).strip ()
168+ # Strip bold markers that pymupdf4llm adds to headings
169+ return re .sub (r"\*\*(.+?)\*\*" , r"\1" , title )
170+ if not fallback :
171+ fallback = stripped [:200 ]
172+ return fallback
175173
176174
177175def _parse_docx (path : Path ) -> tuple [str , str ]:
@@ -280,11 +278,69 @@ def _fetch_arxiv_pdf(url: str) -> tuple[str, str]:
280278 tmp_path .unlink (missing_ok = True )
281279
282280
281+ def _tabular_to_markdown (table_el ) -> str :
282+ """Convert a BS4 ltx_tabular element to a markdown table."""
283+ rows = []
284+ for tr in table_el .find_all ("tr" ):
285+ cells = tr .find_all (["td" , "th" ])
286+ if not cells :
287+ continue
288+ row = [
289+ cell .get_text (" " , strip = True ).replace ("|" , r"\|" ).replace ("\n " , " " )
290+ for cell in cells
291+ ]
292+ rows .append (row )
293+
294+ if not rows :
295+ return ""
296+
297+ ncols = max (len (r ) for r in rows )
298+
299+ def pad (r ):
300+ return r + ["" ] * (ncols - len (r ))
301+
302+ lines = ["| " + " | " .join (pad (rows [0 ])) + " |" ,
303+ "| " + " | " .join (["---" ] * ncols ) + " |" ]
304+ for row in rows [1 :]:
305+ lines .append ("| " + " | " .join (pad (row )[:ncols ]) + " |" )
306+ return "\n " .join (lines )
307+
308+
def _figure_or_table_to_markdown(fig_el) -> str:
    """Convert an ltx_figure or ltx_table element to markdown text.

    Tables become markdown tables (with caption above, bolded).
    Image figures are reduced to caption text only (italicized).
    Returns "" when there is neither a caption nor usable table content.
    """
    caption_el = fig_el.find(class_="ltx_caption")
    caption = (caption_el.get_text(" ", strip=True) if caption_el else "").replace("\n", " ")

    # Table figure: contains ltx_tabular
    tabular = fig_el.find(class_="ltx_tabular")
    if tabular:
        table_md = _tabular_to_markdown(tabular)
        if not table_md:
            return f"**{caption}**" if caption else ""
        return f"**{caption}**\n\n{table_md}" if caption else table_md

    def _width(img) -> int:
        # LaTeXML typically emits integer pixel widths, but be defensive:
        # non-numeric values like "50%" must not crash parsing — treat the
        # image as full-size (kept) rather than raising ValueError.
        raw = img.get("width", "100") or "100"
        try:
            return int(raw)
        except (TypeError, ValueError):
            return 100

    # Image figure: collect main graphics, skipping tiny caption icons and
    # any graphic nested inside the caption itself.
    imgs = [
        img for img in fig_el.find_all("img", class_="ltx_graphics")
        if not (caption_el and caption_el.find(lambda t: t is img))
        and _width(img) >= 30
    ]
    if not imgs:
        return f"**{caption}**" if caption else ""

    return f"*{caption}*" if caption else ""
336+
337+
283338def parse_arxiv_html (url : str ) -> tuple [str , str ]:
284339 """Fetch and parse an arXiv HTML page into (title, full_text).
285340
286341 Works with arXiv HTML URLs like https://arxiv.org/html/2310.06825.
287342 The HTML is generated by LaTeXML and uses ltx_* CSS classes.
343+ Tables are converted to markdown tables; figures keep caption text only.
288344 """
289345 from urllib .error import URLError
290346 from urllib .request import Request , urlopen
@@ -323,14 +379,35 @@ def parse_arxiv_html(url: str) -> tuple[str, str]:
323379 for el in doc .select (sel ):
324380 el .decompose ()
325381
382+ # Pre-process figures and tables: convert to markdown and replace with
383+ # ltx_para marker divs so they appear at the correct position in the flow.
384+ # Use exact class match (not substring) to avoid matching ltx_figure_panel etc.
385+ inserted_markers = False
386+ for fig in doc .find_all (lambda tag : _tag_has_exact_class (tag , "ltx_figure" , "ltx_table" )):
387+ md = _figure_or_table_to_markdown (fig )
388+ if md :
389+ marker = soup .new_tag ("div" )
390+ marker ["class" ] = "ltx_para"
391+ marker ["data-oar-content" ] = md
392+ fig .replace_with (marker )
393+ inserted_markers = True
394+ else :
395+ fig .decompose ()
396+
326397 # Extract structured text using leaf content elements only.
327398 # ltx_para = paragraph text, ltx_title_* = headings, ltx_abstract = abstract,
328- # ltx_theorem/ltx_proof = theorems, ltx_caption = figure captions .
399+ # ltx_theorem/ltx_proof = theorems. Captions are now handled via figure pre-processing .
329400 # We do NOT match ltx_section/ltx_subsection (containers that include all children).
330401 sections = []
331402 for element in doc .find_all (class_ = re .compile (
332- r"^ltx_(para$|title_|abstract$|theorem$|proof$|caption )"
403+ r"^ltx_(para$|title_|abstract$|theorem$|proof$)"
333404 )):
405+ # Figure/table markers: use pre-computed markdown directly
406+ oar_content = element .get ("data-oar-content" )
407+ if oar_content is not None :
408+ sections .append (oar_content )
409+ continue
410+
334411 text = element .get_text (" " , strip = True )
335412 if not text :
336413 continue
@@ -352,7 +429,7 @@ def parse_arxiv_html(url: str) -> tuple[str, str]:
352429 # Skip — already handled by ltx_abstract match
353430 continue
354431 elif cls_str .startswith ("ltx_title" ):
355- # Other titles (theorem, proof, caption, etc.)
432+ # Other titles (theorem, proof, etc.)
356433 sections .append (f"\n **{ text } **" )
357434 elif "ltx_abstract" in cls_str :
358435 # Extract just paragraph text, skip the title child
@@ -370,7 +447,7 @@ def parse_arxiv_html(url: str) -> tuple[str, str]:
370447 full_text = "\n \n " .join (sections )
371448
372449 # Fallback: if structured extraction got very little, use plain text
373- if len (full_text ) < 500 :
450+ if len (full_text ) < 500 and not inserted_markers :
374451 full_text = doc .get_text ("\n " , strip = True )
375452
376453 if not title :
0 commit comments