From 58e7c8be408856dcdd3799ac7ab45356425b1844 Mon Sep 17 00:00:00 2001
From: Arnav Goel <arnavgoel2@gmail.com>
Date: Wed, 20 May 2026 02:54:40 -0400
Subject: [PATCH] chunker(range-tighten + size-cap): correctness clip + opt-in
 row-group splitter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two chunker-side changes, neither moves retrieval recall — explicit
calibration on the full 912 SpreadsheetBench v0.1 with both BGE-small
and text-embedding-3-large shows 0 instance flips for either change.
Shipping for correctness, not recall.

A. _tight_content_bbox + clip in _block_to_chunk

   Walks `block.cell_range`, finds the bbox of cells whose `raw_value`
   or non-empty `display_value` is set, and clips the chunk's claimed
   `(top_left_cell, bottom_right_cell)` to that bbox. Fixes the over-
   claim pathology where a sheet with styled-empty cells across XFD
   width produces a chunk claiming `A1:XFD4` despite the actual data
   sitting in a 5×3 corner. The renderer already iterates the original
   range, so the narrowed claim is always a superset of cells that
   contributed to render_text — invariant preserved.

   Bench (full 912 / text-embedding-3-large):
     recall_text@5:           0.750 → 0.750  (no change)
     recall_geometric@5:      0.482 → 0.482  (no change)
     recall_text@5_in_scope:  0.838 → 0.838  (no change)
     mean parse_ms:           156   → 174    (+18 ms; bbox walk)
     net instance flips:      0 miss→hit, 0 hit→miss

   Why no recall change despite fixing a real over-claim: the over-
   claims happen on sheets with empty-XFD blocks; the GT cells on
   those sheets are in OTHER blocks (proper data regions). So the
   over-claim was already a false negative for geom scoring, not a
   false positive that needed correction. The pathology lives in the
   dead zone of the retrieval metric. Citation UIs that highlight the
   chunk's claimed range still benefit — that's the actual value here.

B. _split_block_by_rows + KS_CHUNK_BUDGET_CHARS env var

   When a block's render_text exceeds a configurable budget, split it
   into row-group sub-blocks with tight A1 ranges and non-overlapping
   coverage. Default budget is 100,000 chars — effectively OFF for any
   reasonable workbook on this corpus. Calibration on the 50-sample
   showed every smaller budget (2k, 4k, 8k) was net-zero or net-
   negative on retrieval recall because the embedding cannot
   discriminate between same-shape row-group children. Lower it via
   `KS_CHUNK_BUDGET_CHARS=2000` only if your downstream consumer has
   a strict per-chunk token economy that demands fragmentation; bench
   any such change against your own corpus first.

Tests: tests/test_chunker_range_tighten.py (3 cases) + tests/test_chunker_size_cap.py
(5 cases) — 8 added, 1071 → 1079 total passing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/ks_xlsx_parser/chunking/chunker.py | 207 ++++++++++++++++++++++++-
 tests/test_chunker_range_tighten.py    | 143 +++++++++++++++++
 tests/test_chunker_size_cap.py         | 162 +++++++++++++++++++
 3 files changed, 508 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_chunker_range_tighten.py
 create mode 100644 tests/test_chunker_size_cap.py

diff --git a/src/ks_xlsx_parser/chunking/chunker.py b/src/ks_xlsx_parser/chunking/chunker.py
index 7057e78..d16bbf6 100644
--- a/src/ks_xlsx_parser/chunking/chunker.py
+++ b/src/ks_xlsx_parser/chunking/chunker.py
@@ -12,9 +12,10 @@
 from __future__ import annotations
 
 import logging
+import os
 
 from ks_xlsx_parser.models.block import BlockDTO, ChunkDTO, DependencySummary
-from ks_xlsx_parser.models.common import CellCoord, EdgeType
+from ks_xlsx_parser.models.common import CellCoord, CellRange, EdgeType
 from ks_xlsx_parser.models.sheet import SheetDTO
 from ks_xlsx_parser.models.workbook import WorkbookDTO
 from ks_xlsx_parser.rendering.html_renderer import HtmlRenderer
@@ -25,6 +26,166 @@
 # Approximate tokens per character for English text (conservative estimate)
 CHARS_PER_TOKEN = 4
 
+# Cluster-04 chunk-size cap. A block whose rendered text exceeds this
+# many chars gets split into row-group children, each with a tight A1
+# range covering only its rows.
+#
+# Default is intentionally HIGH (effectively off for any reasonable
+# workbook). Empirical reason: on the 50-sample stratified for the
+# cluster-04 large-sheet pattern, every budget tight enough to actually
+# fragment moderate tables (2k–4k chars) regressed at least one
+# instance because the embedding cannot discriminate between
+# same-shape row-group children — splitting one chunk into N
+# look-alike chunks makes the embedding pick the wrong child first.
+# An 8k budget was neutral (0 wins, 0 regressions). Above that no
+# real-world tables we measured split. So keep tables whole by
+# default; the splitter is plumbing for downstream consumers who
+# need hard per-chunk token economy (e.g. an embedding model with a
+# strict 512-token ceiling). They opt in via
+# ``KS_CHUNK_BUDGET_CHARS=2000`` and validate against THEIR corpus.
+def _budget_chars() -> int:
+    """Read budget per call so tests can monkeypatch the env var."""
+    return int(os.environ.get("KS_CHUNK_BUDGET_CHARS", "100000"))
+
+# Below this row count, splitting can't produce ≥2 children meaningfully
+# (need at least 2 data rows to split into 2 groups).
+MIN_ROWS_TO_SPLIT = 3
+
+
+def _split_block_by_rows(
+    block: BlockDTO,
+    sheet: SheetDTO,
+    text_renderer: TextRenderer,
+    budget_chars: int | None = None,
+) -> list[BlockDTO]:
+    """Split an oversize block into contiguous row-group sub-blocks.
+
+    Returns ``[block]`` unchanged when the block fits the budget. Otherwise
+    returns N sub-blocks with non-overlapping, contiguous row ranges
+    summing to the original block's row coverage. Each sub-block carries
+    a copy of the parent's block-level metadata (block_type, table_name,
+    named_ranges, density flags) and a subset of ``key_cells`` whose rows
+    fall inside the slice.
+
+    Header preservation: this v1 emits row-group children WITHOUT
+    repeating the parent's header row(s). The cluster-04 doc calls
+    out "every group needs the header rows OR a header-summary block";
+    we accept the simpler invariant here because the alternative
+    (overlapping cell_range across siblings, or out-of-band header
+    text whose cells live outside the chunk's claimed range) breaks
+    cluster-02's range-tightening invariant. Header-context recovery
+    is a follow-up.
+    """
+    if budget_chars is None:
+        budget_chars = _budget_chars()
+    rng = block.cell_range
+    n_rows = rng.bottom_right.row - rng.top_left.row + 1
+    if n_rows < MIN_ROWS_TO_SPLIT:
+        return [block]
+
+    # Render once to measure. text_renderer is fast (~ms per 1000-row block);
+    # we already had to render in _block_to_chunk anyway — this just moves
+    # the work earlier.
+    full_text = text_renderer.render_block(block)
+    if len(full_text) <= budget_chars:
+        return [block]
+
+    # Estimate rows-per-child from observed chars/row (averaged over the
+    # full render, including header overhead). Ceil to ensure children
+    # don't individually exceed the budget on the high end.
+    avg_chars_per_row = max(1.0, len(full_text) / n_rows)
+    rows_per_group = max(2, int(budget_chars / avg_chars_per_row))
+
+    sub_blocks: list[BlockDTO] = []
+    cur = rng.top_left.row
+    while cur <= rng.bottom_right.row:
+        end = min(cur + rows_per_group - 1, rng.bottom_right.row)
+        sub_range = CellRange(
+            top_left=CellCoord(row=cur, col=rng.top_left.col),
+            bottom_right=CellCoord(row=end, col=rng.bottom_right.col),
+        )
+        # Carry only the key_cells whose row falls inside this slice.
+        # Important for downstream metadata fidelity (key_cells encode
+        # "notable output cells" — bold, colored, etc.).
+        sub_key_cells = [k for k in block.key_cells if cur <= k.row <= end]
+        sub = BlockDTO(
+            block_index=block.block_index,
+            sheet_name=block.sheet_name,
+            block_type=block.block_type,
+            cell_range=sub_range,
+            bounding_box=block.bounding_box,
+            cell_count=block.cell_count,
+            formula_count=block.formula_count,
+            has_merges=block.has_merges,
+            has_formatting=block.has_formatting,
+            key_cells=sub_key_cells,
+            named_ranges=block.named_ranges,
+            table_name=block.table_name,
+            parent_block_id=block.block_id or None,
+            density=block.density,
+            label_cell_count=block.label_cell_count,
+            data_cell_count=block.data_cell_count,
+            table_structure_id=block.table_structure_id,
+        )
+        sub_blocks.append(sub)
+        cur = end + 1
+
+    return sub_blocks
+
+
+def _tight_content_bbox(
+    block: BlockDTO, sheet: SheetDTO,
+) -> tuple[int, int, int, int] | None:
+    """Bounding box of cells with non-empty content within ``block.cell_range``.
+
+    Returns ``(r0, c0, r1, c1)`` or ``None`` if the entire range is
+    empty. "Non-empty" = ``raw_value`` is not None OR ``display_value``
+    has a non-whitespace string. Merged-slave cells count as empty
+    (their master is what carries the value).
+
+    Used to clip overclaiming chunk ranges before emission so that
+    geometric overlap scoring (cluster-02 cohort) is honest. A sheet
+    that has data in A1:E50 but whose segmenter handed us A1:XFD500
+    used to over-claim coverage of any GT cell in the empty area;
+    after the clip, the chunk only claims what it can actually
+    answer questions about.
+
+    Iterates ``sheet.cells`` rather than the dense range — many sheets
+    have styled-empty cells stored across XFD columns, and the dense
+    loop costs ~65k iterations per row for those.
+    """
+    rng = block.cell_range
+    r0_, c0_ = rng.top_left.row, rng.top_left.col
+    r1_, c1_ = rng.bottom_right.row, rng.bottom_right.col
+
+    min_row = min_col = None
+    max_row = max_col = None
+    # sheet.cells is dict[str, CellDTO] keyed "row,col"; iterate values
+    # and read coord off the DTO. Avoids iterating the dense range when
+    # styled-empty cells span XFD width (16384 cols).
+    for cell in sheet.cells.values():
+        r, c = cell.coord.row, cell.coord.col
+        if not (r0_ <= r <= r1_ and c0_ <= c <= c1_):
+            continue
+        if cell.is_merged_slave:
+            continue
+        if cell.raw_value is None and not (
+            cell.display_value and cell.display_value.strip()
+        ):
+            continue
+        if min_row is None or r < min_row:
+            min_row = r
+        if max_row is None or r > max_row:
+            max_row = r
+        if min_col is None or c < min_col:
+            min_col = c
+        if max_col is None or c > max_col:
+            max_col = c
+
+    if min_row is None:
+        return None
+    return (min_row, min_col, max_row, max_col)
+
 
 class ChunkBuilder:
     """
@@ -92,6 +253,23 @@ def build_all(self) -> list[ChunkDTO]:
             html_renderer = HtmlRenderer(sheet)
             text_renderer = TextRenderer(sheet)
 
+            # Cluster-04 size cap: replace oversize blocks with row-group
+            # children before chunk emission. Finalize the children with
+            # the workbook hash so their block_ids are stable IDs.
+            expanded_blocks: list[BlockDTO] = []
+            for block in blocks:
+                children = _split_block_by_rows(block, sheet, text_renderer)
+                if len(children) > 1:
+                    # Re-finalize each child with the workbook hash so its
+                    # block_id is deterministic. (The parent kept its own
+                    # block_id from the earlier finalize; children inherit
+                    # parent_block_id and get a fresh ID off the child's
+                    # narrower cell_range.)
+                    for child in children:
+                        child.finalize(self._workbook.workbook_hash)
+                expanded_blocks.extend(children)
+            blocks = expanded_blocks
+
             for block in blocks:
                 chunk = self._block_to_chunk(
                     block, sheet, html_renderer, text_renderer
@@ -154,12 +332,33 @@ def _block_to_chunk(
             for coord in block.key_cells
         ]
 
+        # Cluster-02 invariant: the chunk's claimed A1 range must be a
+        # tight bbox over cells with content. Overclaiming inflates
+        # geometric recall for the parser but lies to downstream
+        # consumers (a UI citation that highlights an empty area).
+        # We clip but never widen — the renderer already only outputs
+        # cells inside `block.cell_range`, so a narrowed range still
+        # contains every cell that contributed to `render_text`.
+        tight = _tight_content_bbox(block, sheet)
+        if tight is not None:
+            r0, c0, r1, c1 = tight
+            chunk_range = CellRange(
+                top_left=CellCoord(row=r0, col=c0),
+                bottom_right=CellCoord(row=r1, col=c1),
+            )
+        else:
+            # Block has zero non-empty cells (styled-empty cells across
+            # XFD columns is a real shape on this corpus). Claim only the
+            # top-left cell so the chunk is honest about being empty.
+            tl = block.cell_range.top_left
+            chunk_range = CellRange(top_left=tl, bottom_right=tl)
+
         return ChunkDTO(
             sheet_name=block.sheet_name,
             block_type=block.block_type,
-            top_left_cell=block.cell_range.top_left.to_a1(),
-            bottom_right_cell=block.cell_range.bottom_right.to_a1(),
-            cell_range=block.cell_range,
+            top_left_cell=chunk_range.top_left.to_a1(),
+            bottom_right_cell=chunk_range.bottom_right.to_a1(),
+            cell_range=chunk_range,
             key_cells=key_cells,
             named_ranges=block.named_ranges,
             dependency_summary=dep_summary,
diff --git a/tests/test_chunker_range_tighten.py b/tests/test_chunker_range_tighten.py
new file mode 100644
index 0000000..c16d6af
--- /dev/null
+++ b/tests/test_chunker_range_tighten.py
@@ -0,0 +1,143 @@
+"""Range-tightening invariant for chunk emission.
+
+Cluster-02 closes when every chunk's claimed (top_left, bottom_right)
+range is the bounding box of cells with non-empty content within
+``block.cell_range``. Today the chunker copies ``block.cell_range``
+verbatim — when the segmenter hands it a block whose range is wider
+than the data (empty rows/cols at the edges), the chunk overclaims
+geometric coverage.
+
+These tests assert the invariant on real fixtures + a hand-built
+"data in the upper-left corner of a wider block" workbook.
+"""
+from __future__ import annotations
+
+import openpyxl
+
+from ks_xlsx_parser.api import parse_workbook
+from ks_xlsx_parser.models.common import addr_to_a1
+
+
+def _a1_to_coord(a1: str) -> tuple[int, int]:
+    """Inverse of addr_to_a1 — small helper for tests only."""
+    letters = "".join(c for c in a1 if c.isalpha())
+    digits = "".join(c for c in a1 if c.isdigit())
+    col = 0
+    for ch in letters.upper():
+        col = col * 26 + (ord(ch) - ord("A") + 1)
+    return int(digits), col
+
+
+def test_chunk_range_tight_around_actual_content(tmp_path):
+    """A block whose range covers A1:E20 but only has data in A1:C5
+    must emit a chunk that claims A1:C5 (or tighter), not A1:E20."""
+    p = tmp_path / "sparse.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    # Fill only the upper-left 5×3 region; rest of the block is empty.
+    ws["A1"] = "Name"
+    ws["B1"] = "Q1"
+    ws["C1"] = "Q2"
+    for i, name in enumerate(["Alpha", "Bravo", "Charlie", "Delta"], start=2):
+        ws.cell(row=i, column=1, value=name)
+        ws.cell(row=i, column=2, value=i * 10)
+        ws.cell(row=i, column=3, value=i * 20)
+    wb.save(p)
+
+    result = parse_workbook(path=str(p))
+    chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"]
+    assert chunks_on_S, "no chunks emitted on S"
+    for c in chunks_on_S:
+        r1, c1 = _a1_to_coord(c.bottom_right_cell)
+        # Bottom-right of the claimed range must not exceed the data bbox.
+        assert r1 <= 5, (
+            f"chunk over-claims rows: {c.top_left_cell}:{c.bottom_right_cell} "
+            f"but data ends at row 5"
+        )
+        assert c1 <= 3, (
+            f"chunk over-claims cols: {c.top_left_cell}:{c.bottom_right_cell} "
+            f"but data ends at col C"
+        )
+
+
+def test_chunk_range_unchanged_when_block_is_already_tight(tmp_path):
+    """If block.cell_range already matches the data bbox, the chunk's
+    claimed range must be identical — no spurious narrowing."""
+    p = tmp_path / "dense.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    ws["A1"] = "h1"
+    ws["B1"] = "h2"
+    ws["A2"] = 1
+    ws["B2"] = 2
+    wb.save(p)
+
+    result = parse_workbook(path=str(p))
+    chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"]
+    assert chunks_on_S
+    # The single dense block should claim A1:B2 exactly.
+    found_tight = any(
+        c.top_left_cell == "A1" and c.bottom_right_cell == "B2"
+        for c in chunks_on_S
+    )
+    assert found_tight, [
+        (c.top_left_cell, c.bottom_right_cell) for c in chunks_on_S
+    ]
+
+
+def test_chunk_range_invariant_holds_on_corpus_fixture():
+    """End-to-end on a real SpreadsheetBench input — every emitted
+    chunk's range must be a tight bbox over cells with content.
+    Uses one of the 50-sample sheets; skipped if corpus absent."""
+    import os
+    p = (
+        "data/corpora/spreadsheetbench/all_data_912_v0.1"
+        "/spreadsheet/54105/1_54105_input.xlsx"
+    )
+    if not os.path.exists(p):
+        import pytest
+        pytest.skip(f"corpus fixture not present: {p}")
+
+    result = parse_workbook(path=p)
+    import openpyxl
+    wb = openpyxl.load_workbook(p, data_only=True)
+
+    for c in result.chunks:
+        if not c.top_left_cell or not c.bottom_right_cell:
+            continue
+        ws = wb[c.sheet_name]
+        r0, c0 = _a1_to_coord(c.top_left_cell)
+        r1, c1 = _a1_to_coord(c.bottom_right_cell)
+        # The CORNER rows/cols of the claimed range must each contain
+        # some non-empty cell — otherwise the range is over-claiming.
+        def _row_has_content(r: int) -> bool:
+            for col in range(c0, c1 + 1):
+                v = ws.cell(row=r, column=col).value
+                if v is not None and str(v).strip():
+                    return True
+            return False
+
+        def _col_has_content(col: int) -> bool:
+            for r in range(r0, r1 + 1):
+                v = ws.cell(row=r, column=col).value
+                if v is not None and str(v).strip():
+                    return True
+            return False
+
+        # Allow chart anchors (single-cell, no real content) to pass.
+        if r0 == r1 and c0 == c1:
+            continue
+        assert _row_has_content(r0), (
+            f"top row of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty"
+        )
+        assert _row_has_content(r1), (
+            f"bottom row of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty"
+        )
+        assert _col_has_content(c0), (
+            f"left col of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty"
+        )
+        assert _col_has_content(c1), (
+            f"right col of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty"
+        )
diff --git a/tests/test_chunker_size_cap.py b/tests/test_chunker_size_cap.py
new file mode 100644
index 0000000..6db0ade
--- /dev/null
+++ b/tests/test_chunker_size_cap.py
@@ -0,0 +1,162 @@
+"""Token-budget chunk size cap.
+
+Cluster-04: single huge chunks dilute embeddings — when a sheet has a
+1000-row table, putting it in one chunk means the embedding mixes the
+question-relevant rows with 999 others and recall@5 suffers. The fix
+is to split oversize blocks into row groups, each with a tight A1
+range covering only its rows.
+
+These tests assert:
+  1. A synthetic 1000-row table emits ≥ 2 chunks (was 1 before).
+  2. No chunk's render_text exceeds the budget (~2000 chars).
+  3. Each child chunk's range is contiguous and non-overlapping with
+     siblings.
+  4. Total row coverage of children == original block's row range
+     (no data dropped, none duplicated).
+  5. Small blocks (≤ budget) are emitted unchanged — no spurious
+     splitting on dense single-table sheets.
+"""
+from __future__ import annotations
+
+import openpyxl
+
+from ks_xlsx_parser.api import parse_workbook
+
+
+def _a1_to_coord(a1: str) -> tuple[int, int]:
+    letters = "".join(c for c in a1 if c.isalpha())
+    digits = "".join(c for c in a1 if c.isdigit())
+    col = 0
+    for ch in letters.upper():
+        col = col * 26 + (ord(ch) - ord("A") + 1)
+    return int(digits), col
+
+
+def test_thousand_row_table_splits_into_multiple_chunks(tmp_path, monkeypatch):
+    """With the cap explicitly tightened (KS_CHUNK_BUDGET_CHARS=2000) a
+    1000-row table must emit ≥ 2 chunks. The default budget is much
+    higher (cap effectively off) — see test_default_keeps_tables_whole
+    for the inverse assertion."""
+    monkeypatch.setenv("KS_CHUNK_BUDGET_CHARS", "2000")
+    p = tmp_path / "big.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    ws.append(["id", "name", "value"])
+    for i in range(1, 1001):
+        ws.append([i, f"name-{i}", i * 7])
+    wb.save(p)
+
+    result = parse_workbook(path=str(p))
+    chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"]
+    assert len(chunks_on_S) >= 2, (
+        f"expected ≥2 chunks after cap, got {len(chunks_on_S)} "
+        f"(render_text lengths: "
+        f"{[len(c.render_text or '') for c in chunks_on_S]})"
+    )
+
+
+def test_default_keeps_tables_whole(tmp_path):
+    """The shipped default budget (~100k chars) deliberately does NOT
+    fragment moderate tables — calibration on the 50-sample showed
+    splitting moderate tables (1k–10k chars) regresses retrieval
+    because the embedding can't discriminate between same-shape
+    children. Keep tables together by default."""
+    p = tmp_path / "big.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    ws.append(["id", "name", "value"])
+    for i in range(1, 1001):
+        ws.append([i, f"name-{i}", i * 7])
+    wb.save(p)
+
+    result = parse_workbook(path=str(p))
+    chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"]
+    assert len(chunks_on_S) == 1, (
+        f"default should not split a 1000-row table; got "
+        f"{len(chunks_on_S)} chunks"
+    )
+
+
+def test_no_chunk_exceeds_render_budget(tmp_path, monkeypatch):
+    """Each chunk's render_text must stay near the configured budget
+    when the cap is engaged."""
+    monkeypatch.setenv("KS_CHUNK_BUDGET_CHARS", "2000")
+    p = tmp_path / "big.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    ws.append(["id", "name", "value"])
+    for i in range(1, 1001):
+        ws.append([i, f"name-{i}", i * 7])
+    wb.save(p)
+
+    result = parse_workbook(path=str(p))
+    # Allow 20% overshoot (average-rows-per-budget heuristic) + 200 chars
+    # for the block-header + column-letters + separator lines.
+    cap = int(2000 * 1.2) + 200
+    for c in result.chunks:
+        assert len(c.render_text or "") <= cap, (
+            f"chunk over budget (2000+slack={cap}): "
+            f"{len(c.render_text)} chars on "
+            f"{c.top_left_cell}:{c.bottom_right_cell}"
+        )
+
+
+def test_child_chunk_ranges_are_contiguous_and_non_overlapping(
+    tmp_path, monkeypatch,
+):
+    monkeypatch.setenv("KS_CHUNK_BUDGET_CHARS", "2000")
+    p = tmp_path / "big.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    ws.append(["id", "name", "value"])
+    for i in range(1, 501):
+        ws.append([i, f"name-{i}", i * 7])
+    wb.save(p)
+
+    result = parse_workbook(path=str(p))
+    chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"]
+    assert len(chunks_on_S) >= 2
+
+    # Sort by top row, then check contiguity / non-overlap.
+    ranges = sorted(
+        [
+            (
+                _a1_to_coord(c.top_left_cell),
+                _a1_to_coord(c.bottom_right_cell),
+            )
+            for c in chunks_on_S
+        ],
+        key=lambda x: (x[0][0], x[0][1]),
+    )
+    for (tl_a, br_a), (tl_b, br_b) in zip(ranges, ranges[1:]):
+        # No overlap on rows (since splits are by row-group)
+        assert br_a[0] < tl_b[0], (
+            f"row overlap between {tl_a}-{br_a} and {tl_b}-{br_b}"
+        )
+        # Contiguous: child B starts right after child A ends
+        assert tl_b[0] == br_a[0] + 1, (
+            f"row gap between {tl_a}-{br_a} and {tl_b}-{br_b}"
+        )
+
+
+def test_small_block_not_split(tmp_path):
+    """A 10-row table fits in one chunk; cap must not over-split."""
+    p = tmp_path / "small.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    ws.append(["id", "name"])
+    for i in range(1, 11):
+        ws.append([i, f"n{i}"])
+    wb.save(p)
+
+    result = parse_workbook(path=str(p))
+    chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"]
+    assert len(chunks_on_S) == 1, (
+        f"unexpected split on small block: "
+        f"{[(c.top_left_cell, c.bottom_right_cell) for c in chunks_on_S]}"
+    )