From efa2e0dc674b200b0792379141dbfb44f23c434a Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 8 May 2026 13:54:30 -0700 Subject: [PATCH 1/3] Handle PlanarConfiguration=2 across CPU + GPU multi-band reads Two related multi-band accuracy bugs from the geotiff audit: A2 (HIGH): GPU tiled reads of planar=2 files (separate band planes) returned wrong pixel values. gpu_decode_tiles iterates a single TileOffsets list with bytes_per_pixel = itemsize * samples and assumes chunky interleave. For planar=2, each band has its own contiguous run of tiles in TileOffsets, so the kernel read across band boundaries and produced garbage. read_geotiff_gpu now detects planar==2 and decodes each band's tile slab as a single-band image, then stacks the results into (H, W, samples). The chunky planar=1 path is unchanged. A3 (MEDIUM): The GPU stripped multi-band fallback hardcoded dims=['y','x'] even for 3-D CPU reads, mirroring the pre-#1525 bug. The same fix lands here so the planar=2 stripped tests pass; this matches the change in PR #1525 verbatim. Both paths assert the assembled shape equals (H, W, samples) before returning, so a future kernel regression surfaces immediately rather than producing garbage. Adds xrspatial/geotiff/tests/test_planar_multiband.py covering 53 combinations: planar in {separate, contig} x layout in {tiled, stripped} x bands in {2, 3, 4} x dtype in {uint8, uint16} x backend in {CPU, GPU}, plus single-band sanity checks and an explicit A3 repro. GPU tests skip cleanly when CUDA is unavailable. Seeds are fixed. Refs audit issues A2, A3. --- xrspatial/geotiff/__init__.py | 143 ++++++++++++- .../geotiff/tests/test_planar_multiband.py | 195 ++++++++++++++++++ 2 files changed, 336 insertions(+), 2 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_planar_multiband.py diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index f3fef41f..c8559d65 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -13,6 +13,7 @@ """ from __future__ import annotations +import math import warnings import numpy as np @@ -1396,6 +1397,92 @@ def _read(): return _read() +def _gpu_decode_single_band_tiles( + source, offsets, byte_counts, + tw, th, width, height, + compression, predictor, file_dtype, + *, + byte_order: str, + gpu: str, + overview_level, + cupy, +): + """Decode one band's tile sequence into a 2-D ``(H, W)`` cupy array. + + Helper for the ``PlanarConfiguration=2`` GPU path: the existing + ``gpu_decode_tiles`` / ``gpu_decode_tiles_from_file`` kernels assume + a single chunky tile sequence with ``bytes_per_pixel = itemsize * + samples``. For planar=2 each band has its own list of tiles, so we + invoke those kernels once per band with ``samples=1`` and stack the + resulting 2-D arrays into ``(H, W, samples)`` afterwards. + + Mirrors the two-stage GPU pipeline in ``read_geotiff_gpu`` -- GDS + first, then CPU mmap + GPU decode -- and honours the same + ``gpu='strict'`` / ``'auto'`` semantics. Sparse tiles are not + expected here; the caller routes sparse files to the CPU path. + """ + from ._reader import _FileSource + from ._gpu_decode import gpu_decode_tiles, gpu_decode_tiles_from_file + + arr_gpu = None + try: + arr_gpu = gpu_decode_tiles_from_file( + source, offsets, byte_counts, + tw, th, width, height, + compression, predictor, file_dtype, 1, + byte_order=byte_order, + ) + except Exception as e: + if gpu == 'strict': + raise + warnings.warn( + f"read_geotiff_gpu: GPU decode failed " + f"({type(e).__name__}: {e}); falling back to CPU.", + RuntimeWarning, + stacklevel=3, + ) + arr_gpu = None + + if arr_gpu is None: + src2 = _FileSource(source) + data2 = src2.read_all() + try: + compressed_tiles = [ + bytes(data2[offsets[i]:offsets[i] + byte_counts[i]]) + for i in range(len(offsets)) + ] + finally: + src2.close() + try: + arr_gpu = gpu_decode_tiles( + compressed_tiles, + tw, th, width, height, + compression, predictor, file_dtype, 1, + byte_order=byte_order, + ) + except Exception as e: + if gpu == 'strict': + raise + warnings.warn( + f"read_geotiff_gpu: GPU decode failed " + f"({type(e).__name__}: {e}); falling back to CPU.", + RuntimeWarning, + stacklevel=3, + ) + # Per-band CPU fallback would require a single-band CPU + # decode path keyed off planar config; the easier safe move + # is to surface the failure so the caller upstream can fall + # back to read_to_array for the whole image. + raise + + if arr_gpu.ndim != 2 or arr_gpu.shape != (height, width): + raise RuntimeError( + f"single-band GPU tile decode produced shape " + f"{arr_gpu.shape}, expected ({height}, {width})" + ) + return arr_gpu + + def read_geotiff_gpu(source: str, *, dtype=None, overview_level: int | None = None, @@ -1519,7 +1606,9 @@ def read_geotiff_gpu(source: str, *, target = np.dtype(dtype) _validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target) arr_gpu = arr_gpu.astype(target) - # Mirror the tiled branch: 3-D (y, x, band) for multi-band reads. + # Multi-band stripped reads come back as (y, x, band); mirror + # the tiled branch so dims line up with ndim. Single-band stays + # 2-D ('y', 'x'). if arr_gpu.ndim == 3: dims = ['y', 'x', 'band'] coords['band'] = np.arange(arr_gpu.shape[2]) @@ -1533,6 +1622,7 @@ def read_geotiff_gpu(source: str, *, compression = ifd.compression predictor = ifd.predictor samples = ifd.samples_per_pixel + planar = ifd.planar_config tw = ifd.tile_width th = ifd.tile_height width = ifd.width @@ -1558,7 +1648,47 @@ def read_geotiff_gpu(source: str, *, # Sparse tiles (byte_count == 0) are unsupported on the GPU pipeline; # the CPU reader fills them with nodata and copies onto the GPU. has_sparse_tile = any(bc == 0 for bc in byte_counts) - if has_sparse_tile: + + # PlanarConfiguration=2 (separate bands): each band has its own list + # of tiles back-to-back in TileOffsets / TileByteCounts. The GPU + # tile-assembly kernel assumes a single chunky tile sequence with + # bytes_per_pixel = itemsize * samples, so it cannot handle planar=2 + # directly. Decode each band's tile slab as a single-band image, then + # stack into (H, W, samples). For planar=1 (chunky) the existing + # single-pass kernel is correct. + if planar == 2 and samples > 1: + tiles_across = math.ceil(width / tw) + tiles_down = math.ceil(height / th) + tiles_per_band = tiles_across * tiles_down + if tiles_per_band * samples != len(offsets): + raise ValueError( + f"PlanarConfiguration=2 expects " + f"{tiles_per_band * samples} TileOffsets entries " + f"({tiles_across} x {tiles_down} x {samples} bands), " + f"got {len(offsets)}" + ) + band_arrays = [] + for band_idx in range(samples): + b0 = band_idx * tiles_per_band + b1 = b0 + tiles_per_band + band_offsets = list(offsets[b0:b1]) + band_byte_counts = list(byte_counts[b0:b1]) + band_arr = _gpu_decode_single_band_tiles( + source, band_offsets, band_byte_counts, + tw, th, width, height, + compression, predictor, file_dtype, + byte_order=header.byte_order, + gpu=gpu, overview_level=overview_level, + cupy=cupy, + ) + band_arrays.append(band_arr) + arr_gpu = cupy.stack(band_arrays, axis=2) + # Sanity check: assembled shape must match (H, W, samples). + assert arr_gpu.shape == (height, width, samples), ( + f"planar=2 GPU assembly produced shape {arr_gpu.shape}, " + f"expected ({height}, {width}, {samples})" + ) + elif has_sparse_tile: arr_cpu, _ = read_to_array(source, overview_level=overview_level) arr_gpu = cupy.asarray(arr_cpu) else: @@ -1615,6 +1745,15 @@ def read_geotiff_gpu(source: str, *, arr_cpu, _ = read_to_array(source, overview_level=overview_level) arr_gpu = cupy.asarray(arr_cpu) + # Multi-band tiled output must be (H, W, samples) regardless of planar + # config -- catch any shape regression in the kernels before we attach + # dims/coords below. + if samples > 1: + assert arr_gpu.shape[:2] == (height, width) and arr_gpu.shape[2] == samples, ( + f"GPU multi-band tile assembly produced shape {arr_gpu.shape}, " + f"expected ({height}, {width}, {samples})" + ) + if dtype is not None: target = np.dtype(dtype) _validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target) diff --git a/xrspatial/geotiff/tests/test_planar_multiband.py b/xrspatial/geotiff/tests/test_planar_multiband.py new file mode 100644 index 00000000..7162fb45 --- /dev/null +++ b/xrspatial/geotiff/tests/test_planar_multiband.py @@ -0,0 +1,195 @@ +"""Tests for ``PlanarConfiguration`` handling in CPU and GPU multi-band reads. + +Covers two bugs surfaced by the geotiff accuracy audit: + +A2 (HIGH) + GPU tiled multi-band reads returned wrong pixel values because the + tile-assembly kernel iterated a single ``tile_offsets`` list with + ``bytes_per_pixel = itemsize * samples``. Planar=2 files (one tile + sequence per band, contiguous in TileOffsets) silently produced + garbage because the kernel had no notion of separate band slabs. + +A3 (MEDIUM) + The CPU stripped reader returned planar=2 multi-band shape with + axes mismatched against the assigned dims. Already fixed in the CPU + path; this module pins the contract so future refactors do not + regress it. + +Each combination -- planar config x layout x band count x dtype -- +round-trips through the writer (or tifffile, where we want to control +planar config and layout independently) and asserts that the CPU read, +the GPU read, and the original numpy buffer all agree pixel-for-pixel. + +GPU tests skip cleanly when CUDA is unavailable. +""" +from __future__ import annotations + +import importlib.util +import os +import tempfile + +import numpy as np +import pytest + + +tifffile = pytest.importorskip("tifffile") + + +def _gpu_available() -> bool: + """True if cupy is importable and CUDA is initialised.""" + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +_HAS_GPU = _gpu_available() +_gpu_only = pytest.mark.skipif( + not _HAS_GPU, + reason="cupy + CUDA required", +) + + +def _make_data(bands: int, height: int, width: int, dtype) -> np.ndarray: + """Deterministic random multi-band raster shaped (bands, height, width).""" + rng = np.random.RandomState(0xA2A3 + bands * 1000 + height + np.dtype(dtype).itemsize) + info = np.iinfo(dtype) + high = min(int(info.max), 60_000) + 1 + return rng.randint(0, high, size=(bands, height, width)).astype(dtype) + + +def _write_tiff(path: str, data: np.ndarray, *, planar: str, tiled: bool): + """Write *data* (bands, height, width) with the requested planar config + layout. + + tifffile defaults: when bands are first axis, planar='separate'. + Pass planar='contig' to interleave samples (data is transposed to + (height, width, bands) before write). + """ + kwargs = {"photometric": "rgb" if data.shape[0] == 3 else "minisblack"} + if data.shape[0] not in (1, 3): + # tifffile rejects 'rgb' for non-3-band; use a generic photometric + # tag and tell it to skip extra-sample warnings. + kwargs = {"photometric": "minisblack"} + if tiled: + kwargs["tile"] = (32, 32) + if planar == "separate": + kwargs["planarconfig"] = "separate" + tifffile.imwrite(path, data, **kwargs) + elif planar == "contig": + kwargs["planarconfig"] = "contig" + tifffile.imwrite(path, np.transpose(data, (1, 2, 0)), **kwargs) + else: + raise ValueError(f"unknown planar={planar!r}") + + +@pytest.mark.parametrize("planar", ["separate", "contig"]) +@pytest.mark.parametrize("tiled", [True, False]) +@pytest.mark.parametrize("bands", [2, 3, 4]) +@pytest.mark.parametrize("dtype", [np.uint8, np.uint16]) +def test_cpu_planar_multiband(planar, tiled, bands, dtype): + """CPU ``open_geotiff`` returns (y, x, band) for every planar+layout combo.""" + from xrspatial.geotiff import open_geotiff + + height, width = 64, 96 + data = _make_data(bands, height, width, dtype) + expected = np.transpose(data, (1, 2, 0)) + + with tempfile.TemporaryDirectory() as d: + path = os.path.join(d, "x.tif") + _write_tiff(path, data, planar=planar, tiled=tiled) + out = open_geotiff(path) + + assert out.dims == ("y", "x", "band"), ( + f"got dims {out.dims} for planar={planar} tiled={tiled} " + f"bands={bands} dtype={np.dtype(dtype).name}" + ) + assert out.shape == (height, width, bands) + arr = np.asarray(out.data) + assert arr.dtype == np.dtype(dtype) + np.testing.assert_array_equal(arr, expected) + + +@_gpu_only +@pytest.mark.parametrize("planar", ["separate", "contig"]) +@pytest.mark.parametrize("tiled", [True, False]) +@pytest.mark.parametrize("bands", [2, 3, 4]) +@pytest.mark.parametrize("dtype", [np.uint8, np.uint16]) +def test_gpu_matches_cpu_planar_multiband(planar, tiled, bands, dtype): + """GPU read agrees with CPU read AND with the source array.""" + from xrspatial.geotiff import open_geotiff, read_geotiff_gpu + + height, width = 64, 96 + data = _make_data(bands, height, width, dtype) + expected = np.transpose(data, (1, 2, 0)) + + with tempfile.TemporaryDirectory() as d: + path = os.path.join(d, "x.tif") + _write_tiff(path, data, planar=planar, tiled=tiled) + cpu = np.asarray(open_geotiff(path).data) + gpu_da = read_geotiff_gpu(path) + + assert gpu_da.dims == ("y", "x", "band") + assert gpu_da.shape == (height, width, bands) + gpu = gpu_da.data.get() + + np.testing.assert_array_equal(cpu, expected) + np.testing.assert_array_equal(gpu, expected) + np.testing.assert_array_equal(gpu, cpu) + + +@pytest.mark.parametrize("tiled", [True, False]) +def test_cpu_singleband_sanity(tiled): + """Single-band reads stay 2-D regardless of layout.""" + from xrspatial.geotiff import open_geotiff + + rng = np.random.RandomState(7) + data = rng.randint(0, 200, size=(48, 80)).astype(np.uint8) + with tempfile.TemporaryDirectory() as d: + path = os.path.join(d, "s.tif") + kwargs = {"photometric": "minisblack"} + if tiled: + kwargs["tile"] = (32, 32) + tifffile.imwrite(path, data, **kwargs) + out = open_geotiff(path) + assert out.dims == ("y", "x") + assert out.shape == (48, 80) + np.testing.assert_array_equal(np.asarray(out.data), data) + + +@_gpu_only +@pytest.mark.parametrize("tiled", [True, False]) +def test_gpu_singleband_sanity(tiled): + """Single-band GPU reads stay 2-D regardless of layout.""" + from xrspatial.geotiff import read_geotiff_gpu + + rng = np.random.RandomState(7) + data = rng.randint(0, 200, size=(48, 80)).astype(np.uint8) + with tempfile.TemporaryDirectory() as d: + path = os.path.join(d, "s.tif") + kwargs = {"photometric": "minisblack"} + if tiled: + kwargs["tile"] = (32, 32) + tifffile.imwrite(path, data, **kwargs) + out = read_geotiff_gpu(path) + assert out.dims == ("y", "x") + assert out.shape == (48, 80) + np.testing.assert_array_equal(out.data.get(), data) + + +def test_a3_repro_stripped_planar_separate(): + """Spec-level guard for A3: stripped planar=2 must yield (y, x, band).""" + from xrspatial.geotiff import open_geotiff + + data = _make_data(3, 64, 96, np.uint8) + with tempfile.TemporaryDirectory() as d: + path = os.path.join(d, "sp.tif") + tifffile.imwrite(path, data, photometric="rgb") # default planar=2 + out = open_geotiff(path) + assert out.dims == ("y", "x", "band") + assert out.shape == (64, 96, 3), ( + f"got shape {out.shape} for dims {out.dims} -- A3 regressed" + ) From 9563820b3f2b3918aaade96e95f6d55d8c694fd4 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sat, 9 May 2026 06:01:20 -0700 Subject: [PATCH 2/3] Address Copilot review on #1532 Seven findings, all acted on: - Sparse-tile bypass: planar=2 branch ran before the has_sparse_tile check, so a planar=2 file with sparse tiles would hit the GPU path that doesn't handle sparse blocks. Add `not has_sparse_tile` to the planar=2 gate so sparse always routes to the CPU reader. - Strict offset-count check: planar=2 required len(offsets) == tiles_per_band * samples, but validate_tile_layout only requires >=. Files with extra trailing entries that pass the CPU reader were rejected here. Relax to a minimum-length check and slice the first tiles_per_band * samples entries. - Auto-mode fallback semantics: stage-2 GPU failure in _gpu_decode_single_band_tiles raised unconditionally instead of letting the caller fall back to a whole-image CPU read like the chunky path does. Helper now returns None on stage-2 auto failure; caller catches the signal, runs read_to_array + cupy.asarray, and matches the chunky path's gpu='auto' contract. Strict mode still re-raises. - Per-band file re-reads: the helper called src.read_all() for each band's stage-2 fallback, doing N redundant full-file reads on a planar=2 image with N bands. Caller now does read_all() once before the band loop and passes the bytes via a new shared_data parameter. - Unused params: drop overview_level and cupy from the helper signature; neither was referenced. - Two assert statements for runtime shape validation replaced with explicit `if ...: raise RuntimeError(...)` so the checks survive python -O. 53 planar tests + 303 multi/band/planar/strip/tile/sparse regression tests still pass. --- xrspatial/geotiff/__init__.py | 108 +++++++++++++++++++++------------- 1 file changed, 66 insertions(+), 42 deletions(-) diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index c8559d65..8eb4151c 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -1398,14 +1398,12 @@ def _read(): def _gpu_decode_single_band_tiles( - source, offsets, byte_counts, + source, shared_data, offsets, byte_counts, tw, th, width, height, compression, predictor, file_dtype, *, byte_order: str, gpu: str, - overview_level, - cupy, ): """Decode one band's tile sequence into a 2-D ``(H, W)`` cupy array. @@ -1417,11 +1415,19 @@ def _gpu_decode_single_band_tiles( resulting 2-D arrays into ``(H, W, samples)`` afterwards. Mirrors the two-stage GPU pipeline in ``read_geotiff_gpu`` -- GDS - first, then CPU mmap + GPU decode -- and honours the same - ``gpu='strict'`` / ``'auto'`` semantics. Sparse tiles are not - expected here; the caller routes sparse files to the CPU path. + first, then CPU-extracted-tiles GPU decode. ``shared_data`` is the + full file bytes the caller mmapped/read once for the whole planar=2 + decode, so the second stage doesn't redo ``read_all()`` per band. + Sparse tiles are not expected here; the caller routes sparse files + to the CPU path. + + Auto-mode semantics: a stage-1 GDS failure warns and falls through + to stage 2; a stage-2 failure warns and returns ``None`` so the + caller can trigger a whole-image CPU fallback (a per-band CPU + decode would require a single-band CPU path keyed off planar + config, which doesn't exist). Strict mode re-raises from either + stage. """ - from ._reader import _FileSource from ._gpu_decode import gpu_decode_tiles, gpu_decode_tiles_from_file arr_gpu = None @@ -1444,15 +1450,10 @@ def _gpu_decode_single_band_tiles( arr_gpu = None if arr_gpu is None: - src2 = _FileSource(source) - data2 = src2.read_all() - try: - compressed_tiles = [ - bytes(data2[offsets[i]:offsets[i] + byte_counts[i]]) - for i in range(len(offsets)) - ] - finally: - src2.close() + compressed_tiles = [ + bytes(shared_data[offsets[i]:offsets[i] + byte_counts[i]]) + for i in range(len(offsets)) + ] try: arr_gpu = gpu_decode_tiles( compressed_tiles, @@ -1469,11 +1470,7 @@ def _gpu_decode_single_band_tiles( RuntimeWarning, stacklevel=3, ) - # Per-band CPU fallback would require a single-band CPU - # decode path keyed off planar config; the easier safe move - # is to surface the failure so the caller upstream can fall - # back to read_to_array for the whole image. - raise + return None if arr_gpu.ndim != 2 or arr_gpu.shape != (height, width): raise RuntimeError( @@ -1655,39 +1652,62 @@ def read_geotiff_gpu(source: str, *, # bytes_per_pixel = itemsize * samples, so it cannot handle planar=2 # directly. Decode each band's tile slab as a single-band image, then # stack into (H, W, samples). For planar=1 (chunky) the existing - # single-pass kernel is correct. - if planar == 2 and samples > 1: + # single-pass kernel is correct. Sparse-tile files always route to + # the CPU reader regardless of planar config. + if planar == 2 and samples > 1 and not has_sparse_tile: tiles_across = math.ceil(width / tw) tiles_down = math.ceil(height / th) tiles_per_band = tiles_across * tiles_down - if tiles_per_band * samples != len(offsets): + # validate_tile_layout already requires len(offsets) >= the grid; + # accept extra trailing entries (some writers emit padding) and + # only consume the first tiles_per_band * samples. + expected_min = tiles_per_band * samples + if len(offsets) < expected_min: raise ValueError( - f"PlanarConfiguration=2 expects " - f"{tiles_per_band * samples} TileOffsets entries " - f"({tiles_across} x {tiles_down} x {samples} bands), " - f"got {len(offsets)}" + f"PlanarConfiguration=2 expects at least {expected_min} " + f"TileOffsets entries ({tiles_across} x {tiles_down} x " + f"{samples} bands), got {len(offsets)}" ) + # Read the file once for the per-band stage-2 fallback so a GDS + # failure on one band doesn't trigger N redundant read_all()s. + src2 = _FileSource(source) + try: + shared_data = src2.read_all() + finally: + src2.close() band_arrays = [] + cpu_fallback_needed = False for band_idx in range(samples): b0 = band_idx * tiles_per_band b1 = b0 + tiles_per_band band_offsets = list(offsets[b0:b1]) band_byte_counts = list(byte_counts[b0:b1]) band_arr = _gpu_decode_single_band_tiles( - source, band_offsets, band_byte_counts, + source, shared_data, band_offsets, band_byte_counts, tw, th, width, height, compression, predictor, file_dtype, byte_order=header.byte_order, - gpu=gpu, overview_level=overview_level, - cupy=cupy, + gpu=gpu, ) + if band_arr is None: + # Auto-mode signal: stage-2 GPU decode failed for this + # band. There's no per-band CPU decode path, so fall + # back to a whole-image CPU read + GPU upload, matching + # the chunky path's auto-mode semantics. + cpu_fallback_needed = True + break band_arrays.append(band_arr) - arr_gpu = cupy.stack(band_arrays, axis=2) - # Sanity check: assembled shape must match (H, W, samples). - assert arr_gpu.shape == (height, width, samples), ( - f"planar=2 GPU assembly produced shape {arr_gpu.shape}, " - f"expected ({height}, {width}, {samples})" - ) + if cpu_fallback_needed: + arr_cpu, _ = read_to_array(source, overview_level=overview_level) + arr_gpu = cupy.asarray(arr_cpu) + else: + arr_gpu = cupy.stack(band_arrays, axis=2) + if arr_gpu.shape != (height, width, samples): + raise RuntimeError( + f"planar=2 GPU assembly produced shape " + f"{arr_gpu.shape}, expected " + f"({height}, {width}, {samples})" + ) elif has_sparse_tile: arr_cpu, _ = read_to_array(source, overview_level=overview_level) arr_gpu = cupy.asarray(arr_cpu) @@ -1747,12 +1767,16 @@ def read_geotiff_gpu(source: str, *, # Multi-band tiled output must be (H, W, samples) regardless of planar # config -- catch any shape regression in the kernels before we attach - # dims/coords below. + # dims/coords below. Plain `raise` rather than `assert` so the check + # survives `python -O`. if samples > 1: - assert arr_gpu.shape[:2] == (height, width) and arr_gpu.shape[2] == samples, ( - f"GPU multi-band tile assembly produced shape {arr_gpu.shape}, " - f"expected ({height}, {width}, {samples})" - ) + if (arr_gpu.shape[:2] != (height, width) + or arr_gpu.shape[2] != samples): + raise RuntimeError( + f"GPU multi-band tile assembly produced shape " + f"{arr_gpu.shape}, expected " + f"({height}, {width}, {samples})" + ) if dtype is not None: target = np.dtype(dtype) From 5560775866d5234a2e1d0071a59c605d5b38bc3e Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sat, 9 May 2026 06:05:24 -0700 Subject: [PATCH 3/3] Switch shared file read to lazy reader The previous fix did read_all() unconditionally before the band loop to amortise a stage-2 fallback across bands. When every band's GDS path succeeds, those bytes are never used. Replace the eager read_all() with a closure that materialises the buffer on first call and caches it, so: - Every-band-GDS-succeeds case: 0 read_all() calls (was 1). - Any-band-falls-back case: exactly 1 read_all() call regardless of how many bands fall back (was N). Same Copilot-finding-#4 fix, just lazier. --- xrspatial/geotiff/__init__.py | 37 +++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 8eb4151c..19e5331a 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -1398,7 +1398,7 @@ def _read(): def _gpu_decode_single_band_tiles( - source, shared_data, offsets, byte_counts, + source, lazy_data, offsets, byte_counts, tw, th, width, height, compression, predictor, file_dtype, *, @@ -1415,9 +1415,11 @@ def _gpu_decode_single_band_tiles( resulting 2-D arrays into ``(H, W, samples)`` afterwards. Mirrors the two-stage GPU pipeline in ``read_geotiff_gpu`` -- GDS - first, then CPU-extracted-tiles GPU decode. ``shared_data`` is the - full file bytes the caller mmapped/read once for the whole planar=2 - decode, so the second stage doesn't redo ``read_all()`` per band. + first, then CPU-extracted-tiles GPU decode. ``lazy_data`` is a + zero-arg callable that returns the full file bytes; it caches its + result so the first band that needs the stage-2 fallback pays the + ``read_all()``, and subsequent bands reuse the same buffer. When + every band's GDS path succeeds the file is never read at all. Sparse tiles are not expected here; the caller routes sparse files to the CPU path. @@ -1450,6 +1452,7 @@ def _gpu_decode_single_band_tiles( arr_gpu = None if arr_gpu is None: + shared_data = lazy_data() compressed_tiles = [ bytes(shared_data[offsets[i]:offsets[i] + byte_counts[i]]) for i in range(len(offsets)) @@ -1668,13 +1671,23 @@ def read_geotiff_gpu(source: str, *, f"TileOffsets entries ({tiles_across} x {tiles_down} x " f"{samples} bands), got {len(offsets)}" ) - # Read the file once for the per-band stage-2 fallback so a GDS - # failure on one band doesn't trigger N redundant read_all()s. - src2 = _FileSource(source) - try: - shared_data = src2.read_all() - finally: - src2.close() + # Lazy shared file read for the per-band stage-2 fallback. When + # every band's GDS path succeeds, _read_once is never called + # and we skip the read_all() entirely; when any band falls + # back, the first call materialises the bytes and subsequent + # bands reuse the same buffer (so N bands cost at most one + # read_all(), not N). + _shared_data_cache: list = [] + + def _read_once(): + if not _shared_data_cache: + src2 = _FileSource(source) + try: + _shared_data_cache.append(src2.read_all()) + finally: + src2.close() + return _shared_data_cache[0] + band_arrays = [] cpu_fallback_needed = False for band_idx in range(samples): @@ -1683,7 +1696,7 @@ def read_geotiff_gpu(source: str, *, band_offsets = list(offsets[b0:b1]) band_byte_counts = list(byte_counts[b0:b1]) band_arr = _gpu_decode_single_band_tiles( - source, shared_data, band_offsets, band_byte_counts, + source, _read_once, band_offsets, band_byte_counts, tw, th, width, height, compression, predictor, file_dtype, byte_order=header.byte_order,