Lulzx · Lulzx · Feb 28, 2026 · Feb 5, 2026 · Feb 28, 2026 · Feb 28, 2026
diff --git a/python/tests/test_zpdf.py b/python/tests/test_zpdf.py
@@ -210,5 +210,234 @@ def test_repeated_extraction(self):
                 assert isinstance(text, str)
 
 
+class TestReadingOrder:
+    """Exercise extract_page with the reading_order flag switched on and off."""
+
+    def test_extract_page_reading_order(self):
+        """Page 0 extracted in reading order must be a non-empty str."""
+        with zpdf.Document(TEST_PDF) as doc:
+            text = doc.extract_page(0, reading_order=True)
+            assert isinstance(text, str) and len(text) > 0
+
+    def test_reading_order_vs_stream_order(self):
+        """Both modes must return non-empty strings; content may differ."""
+        with zpdf.Document(TEST_PDF) as doc:
+            stream = doc.extract_page(0, reading_order=False)
+            ordered = doc.extract_page(0, reading_order=True)
+            assert isinstance(stream, str) and len(stream) > 0
+            assert isinstance(ordered, str) and len(ordered) > 0
+
+    def test_reading_order_invalid_page(self):
+        """Page index 9999 must raise PageNotFoundError in reading-order mode."""
+        with zpdf.Document(TEST_PDF) as doc, pytest.raises(zpdf.PageNotFoundError):
+            doc.extract_page(9999, reading_order=True)
+
+    @pytest.mark.skipif(not TAGGED_PDF.exists(), reason="Tagged PDF not available")
+    def test_reading_order_tagged_pdf(self):
+        """Page 0 of the tagged PDF extracts to a str in reading-order mode."""
+        with zpdf.Document(TAGGED_PDF) as doc:
+            assert isinstance(doc.extract_page(0, reading_order=True), str)
+
+
+class TestMarkdown:
+    """Exercise the per-page and whole-document Markdown extractors."""
+
+    def test_extract_page_markdown_returns_str(self):
+        """Markdown for page 0 is returned as a str."""
+        with zpdf.Document(TEST_PDF) as doc:
+            assert isinstance(doc.extract_page_markdown(0), str)
+
+    def test_extract_page_markdown_non_empty(self):
+        """Markdown for page 0 is not empty."""
+        with zpdf.Document(TEST_PDF) as doc:
+            assert len(doc.extract_page_markdown(0)) > 0
+
+    def test_extract_page_markdown_invalid_page(self):
+        """Page index 9999 must raise PageNotFoundError."""
+        with zpdf.Document(TEST_PDF) as doc, pytest.raises(zpdf.PageNotFoundError):
+            doc.extract_page_markdown(9999)
+
+    def test_extract_page_markdown_negative_page(self):
+        """Page index -1 must raise PageNotFoundError."""
+        with zpdf.Document(TEST_PDF) as doc, pytest.raises(zpdf.PageNotFoundError):
+            doc.extract_page_markdown(-1)
+
+    def test_extract_all_markdown_returns_str(self):
+        """Whole-document Markdown is returned as a str."""
+        with zpdf.Document(TEST_PDF) as doc:
+            assert isinstance(doc.extract_all_markdown(), str)
+
+    def test_extract_all_markdown_non_empty(self):
+        """Whole-document Markdown is not empty."""
+        with zpdf.Document(TEST_PDF) as doc:
+            assert len(doc.extract_all_markdown()) > 0
+
+    @pytest.mark.skipif(not TAGGED_PDF.exists(), reason="Tagged PDF not available")
+    def test_extract_all_markdown_tagged(self):
+        """The tagged PDF yields a non-empty Markdown str."""
+        with zpdf.Document(TAGGED_PDF) as doc:
+            markdown = doc.extract_all_markdown()
+            assert isinstance(markdown, str) and len(markdown) > 0
+
+    def test_extract_all_markdown_after_close(self):
+        closed_doc = zpdf.Document(TEST_PDF)
+        closed_doc.close()  # any later extraction is expected to be refused
+        with pytest.raises(ValueError, match="closed"):
+            closed_doc.extract_all_markdown()
+
+    def test_extract_page_markdown_after_close(self):
+        closed_doc = zpdf.Document(TEST_PDF)
+        closed_doc.close()  # any later extraction is expected to be refused
+        with pytest.raises(ValueError, match="closed"):
+            closed_doc.extract_page_markdown(0)
+
+
+class TestTextSpan:
+    """Check TextSpan attributes, derived width/height, and repr."""
+
+    @staticmethod
+    def _page0_spans(doc):
+        """Return the spans of page 0, skipping the test if there are none."""
+        spans = doc.extract_bounds(0)
+        if not spans:
+            pytest.skip("No spans on page 0")
+        return spans
+
+    def test_span_fields_present(self):
+        """The first span exposes float geometry/font size and str text."""
+        with zpdf.Document(TEST_PDF) as doc:
+            first = self._page0_spans(doc)[0]
+            for corner in ("x0", "y0", "x1", "y1"):
+                assert isinstance(getattr(first, corner), float)
+            assert isinstance(first.text, str)
+            assert isinstance(first.font_size, float)
+
+    def test_span_width_height(self):
+        """width and height agree with the bounding-box corner differences."""
+        with zpdf.Document(TEST_PDF) as doc:
+            spans = self._page0_spans(doc)
+            for span in spans:
+                assert span.width == pytest.approx(span.x1 - span.x0)
+                assert span.height == pytest.approx(span.y1 - span.y0)
+
+    def test_span_font_size_positive(self):
+        """No span reports a negative font size."""
+        with zpdf.Document(TEST_PDF) as doc:
+            spans = self._page0_spans(doc)
+            for span in spans:
+                assert span.font_size >= 0
+
+    def test_span_repr(self):
+        """repr of a span names the type and includes a text= field."""
+        with zpdf.Document(TEST_PDF) as doc:
+            rendered = repr(self._page0_spans(doc)[0])
+            assert "TextSpan" in rendered
+            assert "text=" in rendered
+
+    def test_span_text_nonempty(self):
+        """At least one span carries non-whitespace text."""
+        with zpdf.Document(TEST_PDF) as doc:
+            spans = self._page0_spans(doc)
+            # A single span with visible text is enough.
+            assert any(span.text.strip() for span in spans)
+
+
+class TestDocumentLen:
+    """Check len() on an open Document and on a closed one."""
+
+    def test_len_equals_page_count(self):
+        with zpdf.Document(TEST_PDF) as open_doc:
+            assert len(open_doc) == open_doc.page_count  # both must agree
+
+    def test_len_after_close(self):
+        closed_doc = zpdf.Document(TEST_PDF)
+        closed_doc.close()
+        with pytest.raises(ValueError, match="closed"):
+            len(closed_doc)
+
+
+class TestPageInfoRepr:
+    """Check what repr() of a PageInfo contains."""
+
+    def test_page_info_repr(self):
+        """repr of page 0's info names the type and shows width=/height=."""
+        with zpdf.Document(TEST_PDF) as doc:
+            page_info = doc.get_page_info(0)
+            rendered = repr(page_info)
+            for expected in ("PageInfo", "width=", "height="):
+                assert expected in rendered
+
+
+class TestMultiPageSeparators:
+    """Verify that multi-page extraction output includes the page separator."""
+
+    @pytest.mark.skipif(not ACROBAT_PDF.exists(), reason="Acrobat PDF not available")
+    def test_extract_all_has_page_separators(self):
+        """extract_all on a document with 2+ pages contains a form feed."""
+        with zpdf.Document(ACROBAT_PDF) as doc:
+            if doc.page_count < 2:
+                pytest.skip("Need multi-page document")
+            # Form-feed (\x0c) is used as page separator
+            assert "\x0c" in doc.extract_all()
+
+
+class TestTaggedPDFSuite:
+    """Run extraction over every file of the benchmark tagged-PDF suite."""
+
+    TAGGED_PDFS = [
+        "PDFUA-Ref-2-01_Magazine-danish.pdf",
+        "PDFUA-Ref-2-02_Invoice.pdf",
+        "PDFUA-Ref-2-03_AcademicAbstract.pdf",
+        "PDFUA-Ref-2-04_Presentation.pdf",
+        "PDFUA-Ref-2-05_BookChapter-german.pdf",
+        "PDFUA-Ref-2-06_Brochure.pdf",
+        "PDFUA-Ref-2-08_BookChapter.pdf",
+    ]
+    # benchmark/ directory, located three directory levels above this test file
+    BENCHMARK_DIR = Path(__file__).parent.parent.parent / "benchmark"
+
+    def _benchmark_pdf(self, filename):
+        """Return the path of *filename* in BENCHMARK_DIR, or skip if absent."""
+        pdf = self.BENCHMARK_DIR / filename
+        if not pdf.exists():
+            pytest.skip(f"{filename} not available")
+        return pdf
+
+    @pytest.mark.parametrize("filename", TAGGED_PDFS)
+    def test_tagged_pdf_extract_all(self, filename):
+        with zpdf.Document(self._benchmark_pdf(filename)) as doc:
+            assert isinstance(doc.extract_all(), str)
+
+    @pytest.mark.parametrize("filename", TAGGED_PDFS)
+    def test_tagged_pdf_markdown(self, filename):
+        with zpdf.Document(self._benchmark_pdf(filename)) as doc:
+            assert isinstance(doc.extract_all_markdown(), str)
+
+
+class TestMalformedPDFRobustness:
+    """Ensure malformed PDFs do not crash — they should either parse or raise a known error."""
+
+    CORPUS_DIR = Path(__file__).parent.parent.parent / "test" / "Test_Corpus"
+
+    # The decorators run inside the class body, so they can reuse CORPUS_DIR
+    # instead of rebuilding the path; ids= puts the file name in each test id.
+    @pytest.mark.skipif(not CORPUS_DIR.exists(), reason="Test_Corpus not available")
+    @pytest.mark.parametrize(
+        "pdf_path", sorted(CORPUS_DIR.glob("*.pdf")), ids=lambda p: p.name
+    )
+    def test_malformed_no_crash(self, pdf_path):
+        """Open *pdf_path* and extract all text; only known zpdf errors are tolerated."""
+        try:
+            with zpdf.Document(pdf_path) as doc:
+                # If we opened it, try extracting text — must not crash
+                try:
+                    _ = doc.extract_all()
+                except (zpdf.ExtractionError, zpdf.PageNotFoundError):
+                    pass  # Known errors are fine
+        except zpdf.InvalidPdfError:
+            pass  # Expected for many malformed files
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])
diff --git a/src/decompress.zig b/src/decompress.zig
@@ -11,7 +11,7 @@
 //! We optimize heavily for FlateDecode since it's the hot path.
 
 const std = @import("std");
-const Object = @import("root.zig").Object;
+const Object = @import("parser.zig").Object;
 
 pub const DecompressError = error{
     UnsupportedFilter,