From 98dd96ef98eb1cae609d93d80c4cc0c0f7696f9d Mon Sep 17 00:00:00 2001
From: Arnav Goel <arnavgoel2@gmail.com>
Date: Wed, 20 May 2026 03:46:07 -0400
Subject: [PATCH] renderer(tier-1): row anchors + number-format expansion +
 merged-cell propagation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three changes to text_renderer.render_block(), each addressing a
distinct way the parser/chunker was losing signal between the workbook
and the chunk's render_text. Together they move the parser-quality
metric (rank IS NOT None — answer surfaced in some chunk's text) by
+0.6 pp on the full 912 SpreadsheetBench v0.1, and lift text@5 by
+1.0 pp / text@5_in_scope by +0.9 pp on the same run.

1. Row-number anchors

   Every data row of the markdown grid now carries an `r<N>` prefix
   where N is the sheet row (1-indexed):

       [Sheet1!A1:D10] (table)
            | A    | B   | C   | D    |
            |------|-----|-----|------|
       r1   | Name | Q1  | Q2  | Q3   |
       r2   | Wgt  | 100 | 150 | 200  |

   A downstream LLM consuming the chunk can now compute cell
   coordinates deterministically: the block header gives the A1
   range; per-row anchors close the gap to (row, col). Citation-
   grade output for the agent-side use cases on ks-backend.

2. Number-format-aware rendering

   When a cell's number_format produces a meaningfully-different
   displayed string (0.06 → "6%", 1272 → "1,272.00", 46022 → date),
   we now emit both:

       r2   | 0.06 [6%] | 1272 [1,272.00] |

   Substring-search retrieval hits either form — the question may
   quote the raw or the displayed, and answer.xlsx may use the
   display form even though input.xlsx keeps the raw.

   Trivial diffs (1272 → "1272.00", "1272.0") are NOT expanded — no
   information added, only noise.

3. Merged-cell value propagation

   Slave cells in a merged region currently render blank because
   openpyxl returns None for them. That kills text-match retrieval
   whenever a question references the cell by a slave coordinate.
   Renderer now looks up the master and emits the master's value
   at each slave with a `← ` propagation marker:

       r1   | Total | ← Total | ← Total |

   The merged region's visible value now appears at every position
   it appears in Excel, not just the top-left.

Bench (full 912 / text-embedding-3-large):
  parser-quality (rank IS NOT None): 0.843 → 0.849 (+4 instances)
  recall_text@5:                     0.750 → 0.760 (+0.010)
  recall_text@5_in_scope:            0.838 → 0.847 (+0.009)
  recall_geometric@5:                0.482 → 0.484 (no real change)
  mean parse_ms:                     156   → 184    (+27 ms)
  per-instance:                      6 miss→hit, 0 hit→miss

Tests: tests/test_renderer_tier1.py — 7 cases (row anchor presence
+ correct sheet-row indexing, percent/decimal format expansion,
trivial-diff suppression, merged-cell propagation + sanity).
1079 → 1086 total passing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/ks_xlsx_parser/rendering/text_renderer.py | 142 ++++++++++---
 tests/test_renderer_tier1.py                  | 200 ++++++++++++++++++
 2 files changed, 308 insertions(+), 34 deletions(-)
 create mode 100644 tests/test_renderer_tier1.py
diff --git a/src/ks_xlsx_parser/rendering/text_renderer.py b/src/ks_xlsx_parser/rendering/text_renderer.py
index e2c15f4..1afc2e5 100644
--- a/src/ks_xlsx_parser/rendering/text_renderer.py
+++ b/src/ks_xlsx_parser/rendering/text_renderer.py
@@ -13,7 +13,7 @@
 
 from ks_xlsx_parser.models.block import BlockDTO
 from ks_xlsx_parser.models.chart import ChartDTO
-from ks_xlsx_parser.models.common import BlockType, col_number_to_letter
+from ks_xlsx_parser.models.common import BlockType, CellCoord, col_number_to_letter
 from ks_xlsx_parser.models.sheet import SheetDTO
 
 logger = logging.getLogger(__name__)
@@ -57,9 +57,15 @@ def _format_number_for_retrieval(raw: int | float) -> str:
 def _cell_render_value(cell) -> str:
     """Pick the string form of `cell` that's best for RAG retrieval.
 
-    For *numeric* cells we ignore the display-formatted string and emit
-    the raw value verbatim — Excel's commas, percent signs, trailing
-    zeros, and currency symbols all defeat substring search.
+    For *numeric* cells we emit the raw value verbatim — Excel's
+    commas, percent signs, trailing zeros, and currency symbols all
+    defeat substring search. **When the cell carries a number-format
+    that meaningfully changes the displayed string (e.g. 0.06 → "6%",
+    1272 → "1,272.00", 46022 → "2025-12-31"), we additionally append
+    the formatted form in `[brackets]`** so substring match hits
+    either the raw or the displayed shape — the question may quote
+    either, and `answer.xlsx` may use the display form even though
+    `input.xlsx` keeps the raw.
 
     For dates we emit ISO ``YYYY-MM-DD`` (no time component) which is
     both human-readable and matches the date format that openpyxl /
@@ -79,7 +85,15 @@ def _cell_render_value(cell) -> str:
         return raw.isoformat()
 
     if isinstance(raw, (int, float)) and not isinstance(raw, bool):
-        return _format_number_for_retrieval(raw)
+        raw_str = _format_number_for_retrieval(raw)
+        # If a meaningful number-format produced a different display
+        # string, emit both forms. Skip when the displayed form is
+        # identical to the raw (no information added) or trivially
+        # convertible (just trailing zeros), to keep render_text terse.
+        disp = (cell.display_value or "").strip()
+        if disp and disp != raw_str and not _is_trivial_format_diff(raw_str, disp):
+            return f"{raw_str} [{disp}]"
+        return raw_str
 
     if cell.display_value is not None:
         return str(cell.display_value)
@@ -88,6 +102,28 @@ def _cell_render_value(cell) -> str:
     return ""
 
 
+def _is_trivial_format_diff(raw_str: str, display_str: str) -> bool:
+    """True if `display_str` adds no retrieval value over `raw_str`.
+
+    Trivial: ``"1272"`` → ``"1272.0"`` / ``"1272.00"`` (trailing zeros
+    only, no other formatting change). The displayed form contributes
+    no new tokens substring-search could hit.
+
+    NOT trivial: ``"1272"`` → ``"1,272.00"`` (thousands separator), or
+    ``"0.06"`` → ``"6%"``, or ``"1272"`` → ``"$1,272"``. Each of these
+    surfaces a distinct token a user might quote.
+    """
+    if raw_str == display_str:
+        return True
+    # Trim trailing zeros after a decimal point on the displayed form.
+    # If what remains equals the raw, the only difference was insignificant.
+    if "." in display_str:
+        head, tail = display_str.split(".", 1)
+        if tail.rstrip("0") == "" and head == raw_str:
+            return True
+    return False
+
+
 class TextRenderer:
     """
     Renders blocks as plain text with coordinate context.
@@ -105,11 +141,17 @@ def render_block(self, block: BlockDTO) -> str:
 
         Format:
             [Sheet1!A1:D10] (table: "SalesData")
-            | A        | B       | C      | D       |
-            |----------|---------|--------|---------|
-            | Product  | Q1      | Q2     | Q3      |
-            | Widget A | 100     | 150    | 200     |
+                  | A        | B       | C      | D       |
+                  |----------|---------|--------|---------|
+            r1    | Product  | Q1      | Q2     | Q3      |
+            r2    | Widget A | 100     | 150    | 200     |
             ...
+
+        Per-row `r<N>` prefix carries the sheet row number so a
+        downstream LLM consumer can compute cell coordinates
+        deterministically (block header gives the A1 range; per-row
+        anchors close the gap to (row, col)). The prefix width is
+        sized to the largest row number in the block.
         """
         rng = block.cell_range
         rows = range(rng.top_left.row, rng.bottom_right.row + 1)
@@ -124,35 +166,73 @@ def render_block(self, block: BlockDTO) -> str:
             header += f' table: "{block.table_name}"'
         lines.append(header)
 
+        # Row-anchor width — `r<N>` plus padding. Sized once per block
+        # so all rows align under a constant-width column.
+        row_anchor_width = max(len(f"r{r}") for r in rows)
+        row_anchor_pad = " " * row_anchor_width  # blank slot for header / separator
+
+        # Build slave→master lookup for merged regions on this sheet.
+        # Slave cells (everything in a merged range except the master)
+        # render the master's value with a `←` propagation marker so
+        # the chunk's text contains the visible value at every position
+        # it appears in Excel, not just the top-left of the region.
+        merged_master: dict[tuple[int, int], CellCoord] = {}
+        for region in self._sheet.merged_regions:
+            mr = region.range
+            master = region.master
+            for r in range(mr.top_left.row, mr.bottom_right.row + 1):
+                for c in range(mr.top_left.col, mr.bottom_right.col + 1):
+                    if r == master.row and c == master.col:
+                        continue
+                    merged_master[(r, c)] = master
+
+        def _value_for(row: int, col: int) -> tuple[str, bool]:
+            """Return (rendered string, is_propagated_from_master)."""
+            cell = self._sheet.get_cell(row, col)
+            if cell is not None and not cell.is_merged_slave:
+                val = _cell_render_value(cell)
+                if cell.formula and not val.startswith("="):
+                    val = f"{val} [=]"
+                return _flatten_cell_text(val), False
+            # Slave: propagate the master's value.
+            master = merged_master.get((row, col))
+            if master is None:
+                return "", False
+            master_cell = self._sheet.get_cell(master.row, master.col)
+            if master_cell is None:
+                return "", False
+            mval = _cell_render_value(master_cell)
+            if master_cell.formula and not mval.startswith("="):
+                mval = f"{mval} [=]"
+            return _flatten_cell_text(f"← {mval}"), True
+
         # Compute column widths using the SAME rendering rules the data
-        # rows will use, including the trailing `[=]` formula marker.
-        # Otherwise `[=]` inflates a cell past col_width post-hoc and
-        # spuriously triggers the long-value fallback below.
+        # rows will use, including the trailing `[=]` formula marker
+        # AND the merged-cell `←` propagation marker. Otherwise these
+        # inflate a cell past col_width post-hoc and spuriously trigger
+        # the long-value fallback below.
         col_widths: dict[int, int] = {}
         for col in cols:
             col_letter = col_number_to_letter(col)
             max_width = len(col_letter)
             for row in rows:
-                cell = self._sheet.get_cell(row, col)
-                if cell is None:
-                    continue
-                val = _cell_render_value(cell)
-                if cell.formula and not val.startswith("="):
-                    val = f"{val} [=]"
-                val = _flatten_cell_text(val)
+                val, _ = _value_for(row, col)
                 max_width = max(max_width, len(val))
             col_widths[col] = min(max_width, 30)  # Cap at 30 for alignment; text may overflow
 
-        # Column header row
+        # Column header row — leading blank slot matches the row-anchor width.
         col_headers = []
         for col in cols:
             if col in self._sheet.hidden_cols:
                 continue
             letter = col_number_to_letter(col)
             col_headers.append(letter.ljust(col_widths[col]))
-        lines.append("| " + " | ".join(col_headers) + " |")
+        lines.append(row_anchor_pad + " | " + " | ".join(col_headers) + " |")
         lines.append(
-            "|-" + "-|-".join("-" * col_widths[c] for c in cols if c not in self._sheet.hidden_cols) + "-|"
+            row_anchor_pad
+            + " |-"
+            + "-|-".join("-" * col_widths[c] for c in cols if c not in self._sheet.hidden_cols)
+            + "-|"
         )
 
         # Data rows
@@ -161,20 +241,13 @@ def render_block(self, block: BlockDTO) -> str:
             if row in self._sheet.hidden_rows:
                 continue
 
+            anchor = f"r{row}".ljust(row_anchor_width)
+
             values = []
             for col in cols:
                 if col in self._sheet.hidden_cols:
                     continue
-                cell = self._sheet.get_cell(row, col)
-                val = _cell_render_value(cell) if cell else ""
-
-                if cell and cell.formula and not val.startswith("="):
-                    val = f"{val} [=]"
-
-                # Markdown table rows are single-line; collapse embedded newlines
-                # (common in headers like "租金\n天数") so they don't break the grid.
-                val = _flatten_cell_text(val)
-
+                val, _ = _value_for(row, col)
                 # Long-value fallback: only triggers if the rendered string
                 # genuinely exceeds the (now consistently-computed) column
                 # width — i.e. the column was capped at 30. We still emit
@@ -182,7 +255,7 @@ def render_block(self, block: BlockDTO) -> str:
                 # alignment overflow; truncating destroys retrievability.
                 values.append(val.ljust(col_widths[col]))
 
-            line = "| " + " | ".join(values) + " |"
+            line = anchor + " | " + " | ".join(values) + " |"
             lines.append(line)
 
             # Add separator after first row if it looks like a header
@@ -191,7 +264,8 @@ def render_block(self, block: BlockDTO) -> str:
                 BlockType.ASSUMPTIONS_TABLE,
             ):
                 lines.append(
-                    "|-"
+                    row_anchor_pad
+                    + " |-"
                     + "-|-".join(
                         "-" * col_widths[c]
                         for c in cols
diff --git a/tests/test_renderer_tier1.py b/tests/test_renderer_tier1.py
new file mode 100644
index 0000000..f64c95f
--- /dev/null
+++ b/tests/test_renderer_tier1.py
@@ -0,0 +1,200 @@
+"""Tier-1 chunker-quality improvements in the text renderer.
+
+Three changes, all in `text_renderer.render_block`:
+
+1. Row-number anchors — each data row of the markdown grid carries
+   its sheet row number, so an LLM consuming the chunk can compute
+   `(row, col)` cell coordinates deterministically. The block header
+   already prints the A1 range; per-row anchors close the gap.
+
+2. Number-format-aware value rendering — Excel cells store a raw
+   value (`0.06`, `46022`) but DISPLAY a formatted form (`6%`,
+   `2025-12-31`) per the cell's `number_format`. When the two differ
+   meaningfully, render BOTH so substring-match retrieval hits either
+   form (the question may quote either, and `answer.xlsx` may use
+   the display form even though `input.xlsx` keeps the raw).
+
+3. Merged-cell value propagation — slave cells in a merged region
+   currently render as blank because openpyxl returns `None` for
+   them. That kills text-match retrieval whenever a question
+   references the cell by a slave coordinate. Render the master's
+   value at each slave with a `←` marker indicating propagation.
+
+Each test uses an openpyxl-built fixture and asserts on the chunk's
+`render_text` end-to-end (via `parse_workbook`) so the change is
+visible to the downstream consumer that matters.
+"""
+from __future__ import annotations
+
+import openpyxl
+
+from ks_xlsx_parser.api import parse_workbook
+
+
+def _all_text(workbook_path) -> str:
+    result = parse_workbook(path=str(workbook_path))
+    return "\n".join(c.render_text or "" for c in result.chunks)
+
+
+# ────────────────────────────────────────────────── #1 row anchors
+
+
+def test_row_anchor_appears_for_each_data_row(tmp_path):
+    """Every non-hidden data row gets a `r<N>` prefix where N is the
+    sheet row number (1-indexed), not the 0-indexed position within
+    the block."""
+    p = tmp_path / "anchors.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    ws["A1"] = "name"
+    ws["B1"] = "qty"
+    ws["A2"] = "Widget"
+    ws["B2"] = 100
+    ws["A3"] = "Hub"
+    ws["B3"] = 200
+    wb.save(p)
+
+    text = _all_text(p)
+    # Every row from the block should be addressable.
+    assert "r1" in text, f"missing row-1 anchor; chunk text was:\n{text}"
+    assert "r2" in text, f"missing row-2 anchor; chunk text was:\n{text}"
+    assert "r3" in text, f"missing row-3 anchor; chunk text was:\n{text}"
+    # The anchors should be visibly bound to the right values — `r2`
+    # should appear ahead of `Widget` on the same line.
+    widget_line = next(
+        (line for line in text.splitlines() if "Widget" in line), None,
+    )
+    assert widget_line is not None and "r2" in widget_line, (
+        f"row anchor not on the same line as its data: "
+        f"line={widget_line!r}"
+    )
+
+
+def test_row_anchor_uses_sheet_row_not_chunk_offset(tmp_path):
+    """A block that starts at row 5 must anchor its first data row as
+    `r5`, not `r1`. Citation depends on this."""
+    p = tmp_path / "offset.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    # Leave rows 1-4 empty (no header lines, no data); block starts at 5.
+    ws["A5"] = "name"
+    ws["B5"] = "qty"
+    ws["A6"] = "Widget"
+    ws["B6"] = 100
+    wb.save(p)
+
+    text = _all_text(p)
+    assert "r5" in text, f"missing r5 anchor:\n{text}"
+    assert "r6" in text, f"missing r6 anchor:\n{text}"
+
+
+# ────────────────────────────────────────────────── #2 number-format expansion
+
+
+def test_percent_format_renders_both_forms(tmp_path):
+    """A cell storing 0.06 with number_format '0%' must render both
+    `0.06` (raw) AND `6%` (displayed). Substring match hits either."""
+    p = tmp_path / "percent.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    ws["A1"] = "rate"
+    ws["A2"] = 0.06
+    ws["A2"].number_format = "0%"
+    wb.save(p)
+
+    text = _all_text(p)
+    assert "0.06" in text, f"raw form missing:\n{text}"
+    assert "6%" in text, f"displayed form missing:\n{text}"
+
+
+def test_decimal_format_renders_both_forms(tmp_path):
+    """Format `#,##0.00` should add the formatted form (`1,272.00`)
+    alongside the raw `1272`."""
+    p = tmp_path / "decimal.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    ws["A1"] = "amount"
+    ws["A2"] = 1272
+    ws["A2"].number_format = "#,##0.00"
+    wb.save(p)
+
+    text = _all_text(p)
+    assert "1272" in text, f"raw form missing:\n{text}"
+    assert "1,272.00" in text, f"comma-format missing:\n{text}"
+
+
+def test_no_format_expansion_when_format_is_general(tmp_path):
+    """When number_format is 'General' (the default), do NOT add a
+    redundant '(value)' clause — the raw is the display."""
+    p = tmp_path / "general.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    ws["A1"] = "amount"
+    ws["A2"] = 1272  # default General format
+    wb.save(p)
+
+    text = _all_text(p)
+    assert "1272" in text
+    # No bracketed duplicate of the raw form.
+    assert "1272 [1272]" not in text
+
+
+# ────────────────────────────────────────────────── #3 merged-cell propagation
+
+
+def test_merged_cell_master_value_propagates_to_slaves(tmp_path):
+    """`ws.merge_cells('A1:C1')` with A1='Total' should render the
+    string 'Total' at A1 AND a propagation marker at B1 and C1 so the
+    full grid carries the visible value at every position it appears
+    in Excel."""
+    p = tmp_path / "merged.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    ws["A1"] = "Total"
+    ws.merge_cells("A1:C1")
+    ws["A2"] = 1
+    ws["B2"] = 2
+    ws["C2"] = 3
+    wb.save(p)
+
+    text = _all_text(p)
+    # The master value must be present at least once.
+    assert "Total" in text, f"master value missing:\n{text}"
+    # The propagated form contains the master's value with a marker.
+    # We accept either "← Total" or any text containing both 'Total'
+    # and a propagation indicator on the slave column's cell.
+    header_row = next(
+        (l for l in text.splitlines() if "Total" in l), None,
+    )
+    assert header_row is not None, "no row contains 'Total'"
+    # Count occurrences of the string `Total` on that row — should be
+    # ≥ 2 (master + at least one slave) for the propagation to be
+    # observable.
+    assert header_row.count("Total") >= 2, (
+        f"expected 'Total' to repeat across the merged region; "
+        f"row was:\n{header_row}"
+    )
+
+
+def test_unmerged_cells_render_normally(tmp_path):
+    """Sanity check: a workbook with no merged regions must produce
+    the same shape as before this change (no spurious markers)."""
+    p = tmp_path / "no_merge.xlsx"
+    wb = openpyxl.Workbook()
+    ws = wb.active
+    ws.title = "S"
+    ws["A1"] = "name"
+    ws["B1"] = "qty"
+    ws["A2"] = "Widget"
+    ws["B2"] = 100
+    wb.save(p)
+
+    text = _all_text(p)
+    # No propagation marker should appear because nothing is merged.
+    assert "←" not in text, f"unexpected propagation marker:\n{text}"