Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
207 changes: 203 additions & 4 deletions src/ks_xlsx_parser/chunking/chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@
from __future__ import annotations

import logging
import os

from ks_xlsx_parser.models.block import BlockDTO, ChunkDTO, DependencySummary
from ks_xlsx_parser.models.common import CellCoord, EdgeType
from ks_xlsx_parser.models.common import CellCoord, CellRange, EdgeType
from ks_xlsx_parser.models.sheet import SheetDTO
from ks_xlsx_parser.models.workbook import WorkbookDTO
from ks_xlsx_parser.rendering.html_renderer import HtmlRenderer
Expand All @@ -25,6 +26,166 @@
# Approximate tokens per character for English text (conservative estimate)
CHARS_PER_TOKEN = 4

# Cluster-04 chunk-size cap. A block whose rendered text exceeds this
# many chars gets split into row-group children, each with a tight A1
# range covering only its rows.
#
# Default is intentionally HIGH (effectively off for any reasonable
# workbook). Empirical reason: on the 50-sample stratified for the
# cluster-04 large-sheet pattern, every budget tight enough to actually
# fragment moderate tables (2k–4k chars) regressed at least one
# instance because the embedding cannot discriminate between
# same-shape row-group children — splitting one chunk into N
# look-alike chunks makes the embedding pick the wrong child first.
# An 8k budget was neutral (0 wins, 0 regressions). Above that no
# real-world tables we measured split. So keep tables whole by
# default; the splitter is plumbing for downstream consumers who
# need hard per-chunk token economy (e.g. an embedding model with a
# strict 512-token ceiling). They opt in via
# ``KS_CHUNK_BUDGET_CHARS=2000`` and validate against THEIR corpus.
def _budget_chars() -> int:
"""Read budget per call so tests can monkeypatch the env var."""
return int(os.environ.get("KS_CHUNK_BUDGET_CHARS", "100000"))

# Below this row count, splitting can't produce ≥2 children meaningfully
# (need at least 2 data rows to split into 2 groups).
MIN_ROWS_TO_SPLIT = 3


def _split_block_by_rows(
block: BlockDTO,
sheet: SheetDTO,
text_renderer: TextRenderer,
budget_chars: int | None = None,
) -> list[BlockDTO]:
"""Split an oversize block into contiguous row-group sub-blocks.

Returns ``[block]`` unchanged when the block fits the budget. Otherwise
returns N sub-blocks with non-overlapping, contiguous row ranges
summing to the original block's row coverage. Each sub-block carries
a copy of the parent's block-level metadata (block_type, table_name,
named_ranges, density flags) and a subset of ``key_cells`` whose rows
fall inside the slice.

Header preservation: this v1 emits row-group children WITHOUT
repeating the parent's header row(s). The cluster-04 doc calls
out "every group needs the header rows OR a header-summary block";
we accept the simpler invariant here because the alternative
(overlapping cell_range across siblings, or out-of-band header
text whose cells live outside the chunk's claimed range) breaks
cluster-02's range-tightening invariant. Header-context recovery
is a follow-up.
"""
if budget_chars is None:
budget_chars = _budget_chars()
rng = block.cell_range
n_rows = rng.bottom_right.row - rng.top_left.row + 1
if n_rows < MIN_ROWS_TO_SPLIT:
return [block]

# Render once to measure. text_renderer is fast (~ms per 1000-row block);
# we already had to render in _block_to_chunk anyway — this just moves
# the work earlier.
full_text = text_renderer.render_block(block)
if len(full_text) <= budget_chars:
return [block]

# Estimate rows-per-child from observed chars/row (averaged over the
# full render, including header overhead). Ceil to ensure children
# don't individually exceed the budget on the high end.
avg_chars_per_row = max(1.0, len(full_text) / n_rows)
rows_per_group = max(2, int(budget_chars / avg_chars_per_row))

sub_blocks: list[BlockDTO] = []
cur = rng.top_left.row
while cur <= rng.bottom_right.row:
end = min(cur + rows_per_group - 1, rng.bottom_right.row)
sub_range = CellRange(
top_left=CellCoord(row=cur, col=rng.top_left.col),
bottom_right=CellCoord(row=end, col=rng.bottom_right.col),
)
# Carry only the key_cells whose row falls inside this slice.
# Important for downstream metadata fidelity (key_cells encode
# "notable output cells" — bold, colored, etc.).
sub_key_cells = [k for k in block.key_cells if cur <= k.row <= end]
sub = BlockDTO(
block_index=block.block_index,
sheet_name=block.sheet_name,
block_type=block.block_type,
cell_range=sub_range,
bounding_box=block.bounding_box,
cell_count=block.cell_count,
formula_count=block.formula_count,
has_merges=block.has_merges,
has_formatting=block.has_formatting,
key_cells=sub_key_cells,
named_ranges=block.named_ranges,
table_name=block.table_name,
parent_block_id=block.block_id or None,
density=block.density,
label_cell_count=block.label_cell_count,
data_cell_count=block.data_cell_count,
table_structure_id=block.table_structure_id,
)
sub_blocks.append(sub)
cur = end + 1

return sub_blocks


def _tight_content_bbox(
block: BlockDTO, sheet: SheetDTO,
) -> tuple[int, int, int, int] | None:
"""Bounding box of cells with non-empty content within ``block.cell_range``.

Returns ``(r0, c0, r1, c1)`` or ``None`` if the entire range is
empty. "Non-empty" = ``raw_value`` is not None OR ``display_value``
has a non-whitespace string. Merged-slave cells count as empty
(their master is what carries the value).

Used to clip overclaiming chunk ranges before emission so that
geometric overlap scoring (cluster-02 cohort) is honest. A sheet
that has data in A1:E50 but whose segmenter handed us A1:XFD500
used to over-claim coverage of any GT cell in the empty area;
after the clip, the chunk only claims what it can actually
answer questions about.

Iterates ``sheet.cells`` rather than the dense range — many sheets
have styled-empty cells stored across XFD columns, and the dense
loop costs ~65k iterations per row for those.
"""
rng = block.cell_range
r0_, c0_ = rng.top_left.row, rng.top_left.col
r1_, c1_ = rng.bottom_right.row, rng.bottom_right.col

min_row = min_col = None
max_row = max_col = None
# sheet.cells is dict[str, CellDTO] keyed "row,col"; iterate values
# and read coord off the DTO. Avoids iterating the dense range when
# styled-empty cells span XFD width (16384 cols).
for cell in sheet.cells.values():
r, c = cell.coord.row, cell.coord.col
if not (r0_ <= r <= r1_ and c0_ <= c <= c1_):
continue
if cell.is_merged_slave:
continue
if cell.raw_value is None and not (
cell.display_value and cell.display_value.strip()
):
continue
if min_row is None or r < min_row:
min_row = r
if max_row is None or r > max_row:
max_row = r
if min_col is None or c < min_col:
min_col = c
if max_col is None or c > max_col:
max_col = c

if min_row is None:
return None
return (min_row, min_col, max_row, max_col)


class ChunkBuilder:
"""
Expand Down Expand Up @@ -92,6 +253,23 @@ def build_all(self) -> list[ChunkDTO]:
html_renderer = HtmlRenderer(sheet)
text_renderer = TextRenderer(sheet)

# Cluster-04 size cap: replace oversize blocks with row-group
# children before chunk emission. Finalize the children with
# the workbook hash so their block_ids are stable IDs.
expanded_blocks: list[BlockDTO] = []
for block in blocks:
children = _split_block_by_rows(block, sheet, text_renderer)
if len(children) > 1:
# Re-finalize each child with the workbook hash so its
# block_id is deterministic. (The parent kept its own
# block_id from the earlier finalize; children inherit
# parent_block_id and get a fresh ID off the child's
# narrower cell_range.)
for child in children:
child.finalize(self._workbook.workbook_hash)
expanded_blocks.extend(children)
blocks = expanded_blocks

for block in blocks:
chunk = self._block_to_chunk(
block, sheet, html_renderer, text_renderer
Expand Down Expand Up @@ -154,12 +332,33 @@ def _block_to_chunk(
for coord in block.key_cells
]

# Cluster-02 invariant: the chunk's claimed A1 range must be a
# tight bbox over cells with content. Overclaiming inflates
# geometric recall for the parser but lies to downstream
# consumers (a UI citation that highlights an empty area).
# We clip but never widen — the renderer already only outputs
# cells inside `block.cell_range`, so a narrowed range still
# contains every cell that contributed to `render_text`.
tight = _tight_content_bbox(block, sheet)
if tight is not None:
r0, c0, r1, c1 = tight
chunk_range = CellRange(
top_left=CellCoord(row=r0, col=c0),
bottom_right=CellCoord(row=r1, col=c1),
)
else:
# Block has zero non-empty cells (styled-empty cells across
# XFD columns is a real shape on this corpus). Claim only the
# top-left cell so the chunk is honest about being empty.
tl = block.cell_range.top_left
chunk_range = CellRange(top_left=tl, bottom_right=tl)

return ChunkDTO(
sheet_name=block.sheet_name,
block_type=block.block_type,
top_left_cell=block.cell_range.top_left.to_a1(),
bottom_right_cell=block.cell_range.bottom_right.to_a1(),
cell_range=block.cell_range,
top_left_cell=chunk_range.top_left.to_a1(),
bottom_right_cell=chunk_range.bottom_right.to_a1(),
cell_range=chunk_range,
key_cells=key_cells,
named_ranges=block.named_ranges,
dependency_summary=dep_summary,
Expand Down
143 changes: 143 additions & 0 deletions tests/test_chunker_range_tighten.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
"""Range-tightening invariant for chunk emission.

Cluster-02 closes when every chunk's claimed (top_left, bottom_right)
range is the bounding box of cells with non-empty content within
``block.cell_range``. Today the chunker copies ``block.cell_range``
verbatim — when the segmenter hands it a block whose range is wider
than the data (empty rows/cols at the edges), the chunk overclaims
geometric coverage.

These tests assert the invariant on real fixtures + a hand-built
"data in the upper-left corner of a wider block" workbook.
"""
from __future__ import annotations

import openpyxl

from ks_xlsx_parser.api import parse_workbook
from ks_xlsx_parser.models.common import addr_to_a1


def _a1_to_coord(a1: str) -> tuple[int, int]:
"""Inverse of addr_to_a1 — small helper for tests only."""
letters = "".join(c for c in a1 if c.isalpha())
digits = "".join(c for c in a1 if c.isdigit())
col = 0
for ch in letters.upper():
col = col * 26 + (ord(ch) - ord("A") + 1)
return int(digits), col


def test_chunk_range_tight_around_actual_content(tmp_path):
"""A block whose range covers A1:E20 but only has data in A1:C5
must emit a chunk that claims A1:C5 (or tighter), not A1:E20."""
p = tmp_path / "sparse.xlsx"
wb = openpyxl.Workbook()
ws = wb.active
ws.title = "S"
# Fill only the upper-left 5×3 region; rest of the block is empty.
ws["A1"] = "Name"
ws["B1"] = "Q1"
ws["C1"] = "Q2"
for i, name in enumerate(["Alpha", "Bravo", "Charlie", "Delta"], start=2):
ws.cell(row=i, column=1, value=name)
ws.cell(row=i, column=2, value=i * 10)
ws.cell(row=i, column=3, value=i * 20)
wb.save(p)

result = parse_workbook(path=str(p))
chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"]
assert chunks_on_S, "no chunks emitted on S"
for c in chunks_on_S:
r1, c1 = _a1_to_coord(c.bottom_right_cell)
# Bottom-right of the claimed range must not exceed the data bbox.
assert r1 <= 5, (
f"chunk over-claims rows: {c.top_left_cell}:{c.bottom_right_cell} "
f"but data ends at row 5"
)
assert c1 <= 3, (
f"chunk over-claims cols: {c.top_left_cell}:{c.bottom_right_cell} "
f"but data ends at col C"
)


def test_chunk_range_unchanged_when_block_is_already_tight(tmp_path):
"""If block.cell_range already matches the data bbox, the chunk's
claimed range must be identical — no spurious narrowing."""
p = tmp_path / "dense.xlsx"
wb = openpyxl.Workbook()
ws = wb.active
ws.title = "S"
ws["A1"] = "h1"
ws["B1"] = "h2"
ws["A2"] = 1
ws["B2"] = 2
wb.save(p)

result = parse_workbook(path=str(p))
chunks_on_S = [c for c in result.chunks if c.sheet_name == "S"]
assert chunks_on_S
# The single dense block should claim A1:B2 exactly.
found_tight = any(
c.top_left_cell == "A1" and c.bottom_right_cell == "B2"
for c in chunks_on_S
)
assert found_tight, [
(c.top_left_cell, c.bottom_right_cell) for c in chunks_on_S
]


def test_chunk_range_invariant_holds_on_corpus_fixture():
"""End-to-end on a real SpreadsheetBench input — every emitted
chunk's range must be a tight bbox over cells with content.
Uses one of the 50-sample sheets; skipped if corpus absent."""
import os
p = (
"data/corpora/spreadsheetbench/all_data_912_v0.1"
"/spreadsheet/54105/1_54105_input.xlsx"
)
if not os.path.exists(p):
import pytest
pytest.skip(f"corpus fixture not present: {p}")

result = parse_workbook(path=p)
import openpyxl
wb = openpyxl.load_workbook(p, data_only=True)

for c in result.chunks:
if not c.top_left_cell or not c.bottom_right_cell:
continue
ws = wb[c.sheet_name]
r0, c0 = _a1_to_coord(c.top_left_cell)
r1, c1 = _a1_to_coord(c.bottom_right_cell)
# The CORNER rows/cols of the claimed range must each contain
# some non-empty cell — otherwise the range is over-claiming.
def _row_has_content(r: int) -> bool:
for col in range(c0, c1 + 1):
v = ws.cell(row=r, column=col).value
if v is not None and str(v).strip():
return True
return False

def _col_has_content(col: int) -> bool:
for r in range(r0, r1 + 1):
v = ws.cell(row=r, column=col).value
if v is not None and str(v).strip():
return True
return False

# Allow chart anchors (single-cell, no real content) to pass.
if r0 == r1 and c0 == c1:
continue
assert _row_has_content(r0), (
f"top row of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty"
)
assert _row_has_content(r1), (
f"bottom row of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty"
)
assert _col_has_content(c0), (
f"left col of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty"
)
assert _col_has_content(c1), (
f"right col of chunk {c.top_left_cell}:{c.bottom_right_cell} is empty"
)
Loading