Skip to content

Commit 4955daa

Browse files
authored
Merge pull request #42 from ChicagoHAI/feat/table-figure-parsing
feat(parser): improve PDF parsing and arXiv HTML extraction
2 parents 9f83762 + 58f18fd commit 4955daa

8 files changed

Lines changed: 408 additions & 89 deletions

File tree

CLAUDE.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22

33
### Environment
44
- Python 3.12. Install with `pip install -e .` (or `pip install .`).
5-
- Dependencies: `openai`, `tiktoken`, `python-dotenv`, `pymupdf`, `python-docx`.
6-
- Dev dependencies (for benchmarks): `beautifulsoup4`, `lxml`. Install with `pip install -e ".[dev]"`.
5+
- Dependencies: `openai`, `tiktoken`, `python-dotenv`, `pymupdf`, `pymupdf4llm`, `pymupdf-layout`, `python-docx`, `beautifulsoup4`, `lxml`.
6+
- Dev dependencies (for benchmarks): `pytest`. Install with `pip install -e ".[dev]"`.
77
- API key and model overrides in `.env` (see `.env.example`).
88

99
### Package (`src/reviewer/`)

examples/review_results/260218458v1_skill.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"slug": "260218458_skill",
2+
"slug": "260218458v1_skill",
33
"title": "The Story is Not the Science:Execution-Grounded Evaluation of\nMechanistic Interpretability Research",
44
"paragraphs": [
55
{

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ dependencies = [
1414
"python-dotenv>=1.2.1",
1515
"tiktoken>=0.12.0",
1616
"pymupdf>=1.24.0",
17+
"pymupdf4llm>=1.27.0",
18+
"pymupdf-layout>=1.27.0",
1719
"python-docx>=1.1.0",
1820
"beautifulsoup4>=4.12.0",
1921
"lxml>=5.0.0",

src/reviewer/parsers.py

Lines changed: 150 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,14 @@
44
from pathlib import Path
55

66

7+
def _tag_has_exact_class(tag, *target_classes: str) -> bool:
8+
"""Check whether a BS4 tag has any of the given class names."""
9+
classes = tag.get("class", [])
10+
if isinstance(classes, str):
11+
classes = classes.split()
12+
return any(cls in target_classes for cls in classes)
13+
14+
715
def is_url(s: str) -> bool:
    """Return True when the string begins with an http:// or https:// scheme."""
    return s.startswith(("http://", "https://"))
@@ -38,14 +46,18 @@ def parse_document(file_path: str | Path) -> tuple[str, str]:
3846

3947

4048
def _parse_pdf(path: Path) -> tuple[str, str]:
    """Extract text from PDF.

    Parser chain:
    1. Marker — best math + table quality, requires heavy ML deps
    2. pymupdf4llm — correct reading order and tables via GNN layout (default)
    """
    try:
        return _parse_pdf_marker(path)
    except (ImportError, FileNotFoundError, RuntimeError) as exc:
        # Marker needs heavy ML dependencies; fall through to the default.
        print(f" Marker not available ({exc}), trying pymupdf4llm...")

    return _parse_pdf_pymupdf4llm(path)
4961

5062

5163
def _parse_pdf_marker(path: Path) -> tuple[str, str]:
@@ -102,76 +114,62 @@ def _parse_pdf_marker(path: Path) -> tuple[str, str]:
102114
return title, markdown
103115

104116

105-
def _extract_title_from_markdown(markdown: str) -> str:
106-
"""Extract the first heading from markdown text as the title."""
107-
for line in markdown.split("\n"):
108-
stripped = line.strip()
109-
if stripped.startswith("#"):
110-
return stripped.lstrip("# ").strip()
111-
# Fallback: first non-empty line
112-
for line in markdown.split("\n"):
113-
if line.strip():
114-
return line.strip()[:200]
115-
return ""
117+
def _parse_pdf_pymupdf4llm(path: Path) -> tuple[str, str]:
    """PDF extraction using pymupdf4llm with GNN layout analysis.

    Compared with raw pymupdf this improves hyphenation, reading order and
    table structure. Importing pymupdf.layout switches on GNN-based table
    detection; if the plugin is absent, pymupdf4llm still runs without it.
    """
    try:
        import pymupdf.layout  # noqa: F401 — activates the layout plugin
    except ImportError:
        pass  # plugin missing: degrade gracefully, no GNN table detection

    import pymupdf4llm

    md = _clean_pymupdf4llm_markdown(pymupdf4llm.to_markdown(str(path)))
    return _extract_title_from_markdown(md), md
121135

122-
doc = pymupdf.open(str(path))
123-
pages = []
124-
title = ""
125136

126-
for page_num, page in enumerate(doc):
127-
text = page.get_text()
128-
pages.append(text)
129-
130-
if page_num == 0 and not title:
131-
blocks = page.get_text("dict")["blocks"]
132-
best_size = 0
133-
for block in blocks:
134-
if "lines" not in block:
135-
continue
136-
for line in block["lines"]:
137-
for span in line["spans"]:
138-
if span["text"].strip() and span["size"] > best_size:
139-
best_size = span["size"]
140-
141-
if best_size > 0:
142-
candidates = []
143-
current_parts = []
144-
for block in blocks:
145-
if "lines" not in block:
146-
if current_parts:
147-
candidates.append(" ".join(current_parts))
148-
current_parts = []
149-
continue
150-
for line in block["lines"]:
151-
for span in line["spans"]:
152-
span_text = span["text"].strip()
153-
if not span_text:
154-
continue
155-
if abs(span["size"] - best_size) < 0.5:
156-
current_parts.append(span_text)
157-
elif current_parts:
158-
candidates.append(" ".join(current_parts))
159-
current_parts = []
160-
if current_parts:
161-
candidates.append(" ".join(current_parts))
162-
if candidates:
163-
title = max(candidates, key=len)
164-
165-
doc.close()
166-
full_text = "\n\n".join(pages)
137+
def _clean_pymupdf4llm_markdown(md: str) -> str:
138+
"""Post-process pymupdf4llm markdown for cleaner LLM input.
167139
168-
if not title:
169-
for line in full_text.split("\n"):
170-
if line.strip():
171-
title = line.strip()[:200]
172-
break
140+
- Strips the noisy '==> picture [WxH] intentionally omitted <==' lines.
141+
The pixel dimensions are meaningless and the phrasing distracts the LLM.
142+
Embedded figure text (chart labels, diagram text) and captions are kept.
143+
- Converts inline <br> separators in embedded text to newlines.
144+
"""
145+
out = []
146+
for line in md.split("\n"):
147+
# Drop picture placeholder lines (with or without bold **)
148+
if "intentionally omitted" in line:
149+
stripped = line.strip().strip("*").strip()
150+
if stripped.startswith("==>"):
151+
continue
152+
# Clean up <br> in embedded figure text lines
153+
if "<br>" in line:
154+
line = line.replace("<br>", "\n")
155+
out.append(line)
156+
return "\n".join(out)
173157

174-
return title, full_text
158+
159+
def _extract_title_from_markdown(markdown: str) -> str:
160+
"""Extract the first heading from markdown text as the title."""
161+
fallback = ""
162+
for line in markdown.split("\n"):
163+
stripped = line.strip()
164+
if not stripped:
165+
continue
166+
if stripped.startswith("#"):
167+
title = stripped.lstrip("# ").strip()
168+
# Strip bold markers that pymupdf4llm adds to headings
169+
return re.sub(r"\*\*(.+?)\*\*", r"\1", title)
170+
if not fallback:
171+
fallback = stripped[:200]
172+
return fallback
175173

176174

177175
def _parse_docx(path: Path) -> tuple[str, str]:
@@ -280,11 +278,69 @@ def _fetch_arxiv_pdf(url: str) -> tuple[str, str]:
280278
tmp_path.unlink(missing_ok=True)
281279

282280

281+
def _tabular_to_markdown(table_el) -> str:
282+
"""Convert a BS4 ltx_tabular element to a markdown table."""
283+
rows = []
284+
for tr in table_el.find_all("tr"):
285+
cells = tr.find_all(["td", "th"])
286+
if not cells:
287+
continue
288+
row = [
289+
cell.get_text(" ", strip=True).replace("|", r"\|").replace("\n", " ")
290+
for cell in cells
291+
]
292+
rows.append(row)
293+
294+
if not rows:
295+
return ""
296+
297+
ncols = max(len(r) for r in rows)
298+
299+
def pad(r):
300+
return r + [""] * (ncols - len(r))
301+
302+
lines = ["| " + " | ".join(pad(rows[0])) + " |",
303+
"| " + " | ".join(["---"] * ncols) + " |"]
304+
for row in rows[1:]:
305+
lines.append("| " + " | ".join(pad(row)[:ncols]) + " |")
306+
return "\n".join(lines)
307+
308+
309+
def _figure_or_table_to_markdown(fig_el) -> str:
310+
"""Convert an ltx_figure or ltx_table element to markdown text.
311+
312+
Tables become markdown tables (with caption above).
313+
Image figures are reduced to caption text only.
314+
"""
315+
caption_el = fig_el.find(class_="ltx_caption")
316+
caption = (caption_el.get_text(" ", strip=True) if caption_el else "").replace("\n", " ")
317+
318+
# Table figure: contains ltx_tabular
319+
tabular = fig_el.find(class_="ltx_tabular")
320+
if tabular:
321+
table_md = _tabular_to_markdown(tabular)
322+
if not table_md:
323+
return f"**{caption}**" if caption else ""
324+
return (f"**{caption}**\n\n{table_md}" if caption else table_md)
325+
326+
# Image figure: collect main graphics, skip tiny caption icons
327+
imgs = [
328+
img for img in fig_el.find_all("img", class_="ltx_graphics")
329+
if not (caption_el and caption_el.find(lambda t: t is img))
330+
and int(img.get("width", "100") or "100") >= 30
331+
]
332+
if not imgs:
333+
return f"**{caption}**" if caption else ""
334+
335+
return f"*{caption}*" if caption else ""
336+
337+
283338
def parse_arxiv_html(url: str) -> tuple[str, str]:
284339
"""Fetch and parse an arXiv HTML page into (title, full_text).
285340
286341
Works with arXiv HTML URLs like https://arxiv.org/html/2310.06825.
287342
The HTML is generated by LaTeXML and uses ltx_* CSS classes.
343+
Tables are converted to markdown tables; figures keep caption text only.
288344
"""
289345
from urllib.error import URLError
290346
from urllib.request import Request, urlopen
@@ -323,14 +379,35 @@ def parse_arxiv_html(url: str) -> tuple[str, str]:
323379
for el in doc.select(sel):
324380
el.decompose()
325381

382+
# Pre-process figures and tables: convert to markdown and replace with
383+
# ltx_para marker divs so they appear at the correct position in the flow.
384+
# Use exact class match (not substring) to avoid matching ltx_figure_panel etc.
385+
inserted_markers = False
386+
for fig in doc.find_all(lambda tag: _tag_has_exact_class(tag, "ltx_figure", "ltx_table")):
387+
md = _figure_or_table_to_markdown(fig)
388+
if md:
389+
marker = soup.new_tag("div")
390+
marker["class"] = "ltx_para"
391+
marker["data-oar-content"] = md
392+
fig.replace_with(marker)
393+
inserted_markers = True
394+
else:
395+
fig.decompose()
396+
326397
# Extract structured text using leaf content elements only.
327398
# ltx_para = paragraph text, ltx_title_* = headings, ltx_abstract = abstract,
328-
# ltx_theorem/ltx_proof = theorems, ltx_caption = figure captions.
399+
# ltx_theorem/ltx_proof = theorems. Captions are now handled via figure pre-processing.
329400
# We do NOT match ltx_section/ltx_subsection (containers that include all children).
330401
sections = []
331402
for element in doc.find_all(class_=re.compile(
332-
r"^ltx_(para$|title_|abstract$|theorem$|proof$|caption)"
403+
r"^ltx_(para$|title_|abstract$|theorem$|proof$)"
333404
)):
405+
# Figure/table markers: use pre-computed markdown directly
406+
oar_content = element.get("data-oar-content")
407+
if oar_content is not None:
408+
sections.append(oar_content)
409+
continue
410+
334411
text = element.get_text(" ", strip=True)
335412
if not text:
336413
continue
@@ -352,7 +429,7 @@ def parse_arxiv_html(url: str) -> tuple[str, str]:
352429
# Skip — already handled by ltx_abstract match
353430
continue
354431
elif cls_str.startswith("ltx_title"):
355-
# Other titles (theorem, proof, caption, etc.)
432+
# Other titles (theorem, proof, etc.)
356433
sections.append(f"\n**{text}**")
357434
elif "ltx_abstract" in cls_str:
358435
# Extract just paragraph text, skip the title child
@@ -370,7 +447,7 @@ def parse_arxiv_html(url: str) -> tuple[str, str]:
370447
full_text = "\n\n".join(sections)
371448

372449
# Fallback: if structured extraction got very little, use plain text
373-
if len(full_text) < 500:
450+
if len(full_text) < 500 and not inserted_markers:
374451
full_text = doc.get_text("\n", strip=True)
375452

376453
if not title:

0 commit comments

Comments
 (0)