Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
229 changes: 229 additions & 0 deletions python/tests/test_zpdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,5 +210,234 @@ def test_repeated_extraction(self):
assert isinstance(text, str)


class TestReadingOrder:
    """Exercise ``Document.extract_page`` with the ``reading_order`` flag."""

    def test_extract_page_reading_order(self):
        """Reading-order extraction of page 0 yields a non-empty string."""
        with zpdf.Document(TEST_PDF) as doc:
            text = doc.extract_page(0, reading_order=True)
            assert isinstance(text, str)
            assert len(text) > 0

    def test_reading_order_vs_stream_order(self):
        """Both extraction modes return non-empty strings for page 0.

        The two results are intentionally not compared with each other:
        the content may legitimately differ between the modes.
        """
        with zpdf.Document(TEST_PDF) as doc:
            stream = doc.extract_page(0, reading_order=False)
            ordered = doc.extract_page(0, reading_order=True)
            assert isinstance(stream, str)
            assert isinstance(ordered, str)
            # Enforce the "non-empty" half of the contract as well; a type
            # check alone would accept an extractor that returns "".
            assert stream
            assert ordered

    def test_reading_order_invalid_page(self):
        """An out-of-range page index raises ``PageNotFoundError``."""
        with zpdf.Document(TEST_PDF) as doc:
            with pytest.raises(zpdf.PageNotFoundError):
                doc.extract_page(9999, reading_order=True)

    @pytest.mark.skipif(not TAGGED_PDF.exists(), reason="Tagged PDF not available")
    def test_reading_order_tagged_pdf(self):
        """Reading-order extraction on the tagged PDF returns a string."""
        with zpdf.Document(TAGGED_PDF) as doc:
            text = doc.extract_page(0, reading_order=True)
            assert isinstance(text, str)


class TestMarkdown:
    """Cover the per-page and whole-document Markdown extraction calls."""

    # --- extract_page_markdown -------------------------------------------

    def test_extract_page_markdown_returns_str(self):
        """Page 0 converts to a ``str``."""
        with zpdf.Document(TEST_PDF) as pdf:
            assert isinstance(pdf.extract_page_markdown(0), str)

    def test_extract_page_markdown_non_empty(self):
        """Page 0 converts to a non-empty result."""
        with zpdf.Document(TEST_PDF) as pdf:
            page_md = pdf.extract_page_markdown(0)
            assert len(page_md) > 0

    def test_extract_page_markdown_invalid_page(self):
        """A page index far past the end raises ``PageNotFoundError``."""
        with zpdf.Document(TEST_PDF) as pdf, pytest.raises(zpdf.PageNotFoundError):
            pdf.extract_page_markdown(9999)

    def test_extract_page_markdown_negative_page(self):
        """Page index ``-1`` raises ``PageNotFoundError``."""
        with zpdf.Document(TEST_PDF) as pdf, pytest.raises(zpdf.PageNotFoundError):
            pdf.extract_page_markdown(-1)

    # --- extract_all_markdown --------------------------------------------

    def test_extract_all_markdown_returns_str(self):
        """The whole document converts to a ``str``."""
        with zpdf.Document(TEST_PDF) as pdf:
            assert isinstance(pdf.extract_all_markdown(), str)

    def test_extract_all_markdown_non_empty(self):
        """The whole document converts to a non-empty result."""
        with zpdf.Document(TEST_PDF) as pdf:
            full_md = pdf.extract_all_markdown()
            assert len(full_md) > 0

    @pytest.mark.skipif(not TAGGED_PDF.exists(), reason="Tagged PDF not available")
    def test_extract_all_markdown_tagged(self):
        """The tagged PDF converts to a non-empty ``str``."""
        with zpdf.Document(TAGGED_PDF) as pdf:
            full_md = pdf.extract_all_markdown()
            assert isinstance(full_md, str)
            assert len(full_md) > 0

    # --- use after close -------------------------------------------------

    def test_extract_all_markdown_after_close(self):
        """``extract_all_markdown`` on a closed document raises ``ValueError``."""
        closed = zpdf.Document(TEST_PDF)
        closed.close()
        with pytest.raises(ValueError, match="closed"):
            closed.extract_all_markdown()

    def test_extract_page_markdown_after_close(self):
        """``extract_page_markdown`` on a closed document raises ``ValueError``."""
        closed = zpdf.Document(TEST_PDF)
        closed.close()
        with pytest.raises(ValueError, match="closed"):
            closed.extract_page_markdown(0)


class TestTextSpan:
    """Check the attributes and derived values of spans from ``extract_bounds``."""

    @staticmethod
    def _page0_spans(doc):
        """Return the spans of page 0, skipping the calling test if there are none."""
        found = doc.extract_bounds(0)
        if not found:
            pytest.skip("No spans on page 0")
        return found

    def test_span_fields_present(self):
        """The first span exposes float coordinates/font size and ``str`` text."""
        # Insertion order mirrors the order in which the fields are checked.
        expected_types = {
            "x0": float,
            "y0": float,
            "x1": float,
            "y1": float,
            "text": str,
            "font_size": float,
        }
        with zpdf.Document(TEST_PDF) as doc:
            first = self._page0_spans(doc)[0]
            for field_name, field_type in expected_types.items():
                assert isinstance(getattr(first, field_name), field_type)

    def test_span_width_height(self):
        """``width``/``height`` equal the coordinate differences (approximately)."""
        with zpdf.Document(TEST_PDF) as doc:
            for span in self._page0_spans(doc):
                assert span.width == pytest.approx(span.x1 - span.x0)
                assert span.height == pytest.approx(span.y1 - span.y0)

    def test_span_font_size_positive(self):
        """No span reports a negative font size (zero is accepted)."""
        with zpdf.Document(TEST_PDF) as doc:
            for span in self._page0_spans(doc):
                assert span.font_size >= 0

    def test_span_repr(self):
        """``repr`` of a span names the type and includes the text field."""
        with zpdf.Document(TEST_PDF) as doc:
            rendered = repr(self._page0_spans(doc)[0])
            assert "TextSpan" in rendered
            assert "text=" in rendered

    def test_span_text_nonempty(self):
        """At least one span on page 0 carries non-whitespace text."""
        with zpdf.Document(TEST_PDF) as doc:
            page_spans = self._page0_spans(doc)
            assert any(span.text.strip() for span in page_spans)


class TestDocumentLen:
    """Check ``len()`` support on ``Document``."""

    def test_len_equals_page_count(self):
        """``len(doc)`` agrees with ``doc.page_count``."""
        with zpdf.Document(TEST_PDF) as pdf:
            reported = len(pdf)
            assert reported == pdf.page_count

    def test_len_after_close(self):
        """``len()`` on a closed document raises ``ValueError``."""
        closed = zpdf.Document(TEST_PDF)
        closed.close()
        with pytest.raises(ValueError, match="closed"):
            len(closed)


class TestPageInfoRepr:
    """Check the ``repr`` of the object returned by ``get_page_info``."""

    def test_page_info_repr(self):
        """The repr names the type and includes both dimensions."""
        with zpdf.Document(TEST_PDF) as pdf:
            rendered = repr(pdf.get_page_info(0))
            for fragment in ("PageInfo", "width=", "height="):
                assert fragment in rendered


class TestMultiPageSeparators:
    """Check that whole-document extraction separates pages."""

    @pytest.mark.skipif(not ACROBAT_PDF.exists(), reason="Acrobat PDF not available")
    def test_extract_all_has_page_separators(self):
        """``extract_all`` output of a multi-page file contains a form feed."""
        with zpdf.Document(ACROBAT_PDF) as pdf:
            has_several_pages = pdf.page_count >= 2
            if not has_several_pages:
                pytest.skip("Need multi-page document")
            # Pages are expected to be delimited by a form-feed character.
            combined = pdf.extract_all()
            assert "\x0c" in combined


class TestTaggedPDFSuite:
    """Run extraction over each tagged PDF listed from the benchmark directory."""

    TAGGED_PDFS = [
        "PDFUA-Ref-2-01_Magazine-danish.pdf",
        "PDFUA-Ref-2-02_Invoice.pdf",
        "PDFUA-Ref-2-03_AcademicAbstract.pdf",
        "PDFUA-Ref-2-04_Presentation.pdf",
        "PDFUA-Ref-2-05_BookChapter-german.pdf",
        "PDFUA-Ref-2-06_Brochure.pdf",
        "PDFUA-Ref-2-08_BookChapter.pdf",
    ]
    BENCHMARK_DIR = Path(__file__).parent.parent.parent / "benchmark"

    def _benchmark_pdf(self, filename):
        """Return the path of *filename* in the benchmark dir, or skip if absent."""
        candidate = self.BENCHMARK_DIR / filename
        if not candidate.exists():
            pytest.skip(f"{filename} not available")
        return candidate

    @pytest.mark.parametrize("filename", TAGGED_PDFS)
    def test_tagged_pdf_extract_all(self, filename):
        """Plain-text extraction of the whole file returns a ``str``."""
        with zpdf.Document(self._benchmark_pdf(filename)) as pdf:
            assert isinstance(pdf.extract_all(), str)

    @pytest.mark.parametrize("filename", TAGGED_PDFS)
    def test_tagged_pdf_markdown(self, filename):
        """Markdown extraction of the whole file returns a ``str``."""
        with zpdf.Document(self._benchmark_pdf(filename)) as pdf:
            assert isinstance(pdf.extract_all_markdown(), str)


class TestMalformedPDFRobustness:
    """Ensure malformed PDFs do not crash — they should either parse or raise a known error."""

    CORPUS_DIR = Path(__file__).parent.parent.parent / "test" / "Test_Corpus"

    # The decorators are evaluated inside the class body, so they can use
    # CORPUS_DIR directly instead of repeating the path expression.
    @pytest.mark.skipif(
        not CORPUS_DIR.exists(),
        reason="Test_Corpus not available",
    )
    @pytest.mark.parametrize(
        "pdf_path",
        sorted(CORPUS_DIR.glob("*.pdf")),
        # Label each case with the file name so a failure identifies the
        # offending corpus file.
        ids=lambda path: path.name,
    )
    def test_malformed_no_crash(self, pdf_path):
        """Open one corpus file and extract its text, tolerating known errors.

        ``InvalidPdfError`` while opening, and ``ExtractionError`` or
        ``PageNotFoundError`` while extracting, are accepted outcomes; any
        other exception fails the test.
        """
        try:
            with zpdf.Document(pdf_path) as doc:
                # If we opened it, try extracting text — must not crash.
                try:
                    doc.extract_all()
                except (zpdf.ExtractionError, zpdf.PageNotFoundError):
                    pass  # Known errors are fine
        except zpdf.InvalidPdfError:
            pass  # Expected for many malformed files


if __name__ == "__main__":
    # pytest.main() returns the exit code rather than exiting; propagate it
    # so that running this file directly ends with a non-zero status when a
    # test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))
2 changes: 1 addition & 1 deletion src/decompress.zig
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
//! We optimize heavily for FlateDecode since it's the hot path.

const std = @import("std");
const Object = @import("root.zig").Object;
const Object = @import("parser.zig").Object;

pub const DecompressError = error{
UnsupportedFilter,
Expand Down
Loading