Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@
-----------


### Unreleased

#### Bug fixes and improvements
- Decode TIFF predictor=2 sample-wise so multi-byte integer files written by libtiff/GDAL read back exactly


### Version 0.9.9 - 2026-05-05

#### Bug fixes and improvements
Expand Down
160 changes: 122 additions & 38 deletions xrspatial/geotiff/_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,80 +333,164 @@ def lzw_compress(data: bytes) -> bytes:


# -- Horizontal predictor (Numba) --------------------------------------------
#
# TIFF predictor=2 (horizontal differencing) operates on whole *samples*
# (per-pixel-component values), not on individual bytes. For multi-byte
# integer samples the difference is computed across full sample values, so
# carries between the low and high bytes are honoured. The byte stream that
# lands on disk is the difference samples re-encoded in the file's byte
# order (always little-endian for files this writer emits, but readers must
# also handle big-endian sources).
#
# The previous implementation here did byte-wise cumulative sum at a sample
# stride, which produced the right answer for 8-bit data but silently
# scrambled multi-byte integers whenever the difference between adjacent
# samples crossed a byte boundary (i.e. needed a carry). Both halves were
# wrong in matching ways, so files this codebase wrote could be re-read,
# but cross-tool reads of files written by libtiff/GDAL produced incorrect
# pixel values. See accuracy sweep finding (2026-05-06).

@ngjit
def _predictor_decode(data, width, height, bytes_per_sample):
"""Undo horizontal differencing predictor (TIFF predictor=2).
def _predictor_decode_u8(data, row_len, height, stride):
"""Undo horizontal differencing on a byte stream (in-place, mod 256).

Operates in-place on the flat byte array, performing cumulative sum
per row at the sample level.
``row_len`` is the full row length in bytes; ``stride`` is the number
of bytes between adjacent same-channel samples (e.g. samples_per_pixel
for chunky 8-bit multi-band data).
"""
row_bytes = width * bytes_per_sample
for row in range(height):
row_start = row * row_bytes
for col in range(bytes_per_sample, row_bytes):
row_start = row * row_len
for col in range(stride, row_len):
idx = row_start + col
data[idx] = np.uint8((np.int32(data[idx]) + np.int32(data[idx - bytes_per_sample])) & 0xFF)
data[idx] = np.uint8(
(np.int32(data[idx]) + np.int32(data[idx - stride])) & 0xFF)


@ngjit
def _predictor_encode(data, width, height, bytes_per_sample):
"""Apply horizontal differencing predictor (TIFF predictor=2).

Operates in-place, converting absolute values to differences.
Process right-to-left to avoid overwriting values we still need.
"""
row_bytes = width * bytes_per_sample
def _predictor_encode_u8(data, row_len, height, stride):
"""Apply horizontal differencing on a byte stream (in-place, mod 256)."""
for row in range(height):
row_start = row * row_bytes
for col in range(row_bytes - 1, bytes_per_sample - 1, -1):
row_start = row * row_len
for col in range(row_len - 1, stride - 1, -1):
idx = row_start + col
data[idx] = np.uint8((np.int32(data[idx]) - np.int32(data[idx - bytes_per_sample])) & 0xFF)
data[idx] = np.uint8(
(np.int32(data[idx]) - np.int32(data[idx - stride])) & 0xFF)


def _predictor_decode_samples(data, image_width, height, dtype, samples=1):
"""Sample-wise decode for multi-byte sample dtypes.

Modifies *data* (a flat uint8 buffer) in place. ``dtype`` carries the
file's byte order so that the buffer is interpreted with the right
endianness; the result is written back in the same byte order.

For chunky multi-band data the cumulative sum runs per-channel so that
each channel's value is differenced against the previous same-channel
pixel (TIFF Technical Note 1, predictor=2).
"""
if samples == 1:
view = data.view(dtype).reshape(height, image_width)
else:
view = data.view(dtype).reshape(height, image_width, samples)
# In-place cumsum in the dtype's modular arithmetic. The ``out`` cast
# keeps the result wrapping inside the sample width for fixed-width
# integers. Float dtypes never reach this branch (the public
# ``predictor_decode`` routes them to the byte-wise legacy kernel)
# because predictor=2 differencing on floats does not invert exactly.
np.cumsum(view, axis=1, dtype=dtype, out=view)


def _predictor_encode_samples(data, image_width, height, dtype, samples=1):
"""Sample-wise encode for multi-byte sample dtypes (in-place)."""
if samples == 1:
view = data.view(dtype).reshape(height, image_width)
else:
view = data.view(dtype).reshape(height, image_width, samples)
view[:, 1:] = np.diff(view, axis=1)


def predictor_decode(data: np.ndarray, width: int, height: int,
bytes_per_sample: int) -> np.ndarray:
bytes_per_sample: int,
dtype: np.dtype | None = None,
samples: int = 1) -> np.ndarray:
"""Undo horizontal differencing predictor (predictor=2).

Parameters
----------
data : np.ndarray
Flat uint8 array of decompressed pixel data (modified in-place).
width, height : int
Image dimensions.
width : int
Image width in pixels. For chunky multi-band data the number of
samples per row is ``width * samples``.
height : int
Number of rows.
bytes_per_sample : int
Bytes per sample (e.g. 1 for uint8, 4 for float32).
Bytes per sample (1, 2, 4, or 8).
dtype : np.dtype, optional
Sample dtype including the file's byte order. Required when
``bytes_per_sample > 1`` so the byte stream can be interpreted
as whole sample values; per TIFF spec, differencing is
sample-level (carries propagate within each sample's bytes).
samples : int
Samples per pixel for chunky planar config.

Returns
-------
np.ndarray
Same array, modified in-place.

Notes
-----
Predictor=2 differences each sample against the previous same-channel
sample (TIFF Tech Note 1). For chunky multi-band data the byte
stride between same-channel samples is ``samples * bytes_per_sample``.
"""
buf = np.ascontiguousarray(data)
_predictor_decode(buf, width, height, bytes_per_sample)
if bytes_per_sample == 1:
# Byte-wise math is exact for 8-bit samples; stride = channels.
_predictor_decode_u8(buf, width * samples, height, samples)
elif dtype is not None and np.dtype(dtype).kind in ('i', 'u'):
_predictor_decode_samples(buf, width, height, np.dtype(dtype),
samples=samples)
elif dtype is not None and np.dtype(dtype).kind == 'f':
# Predictor=2 on floats is outside the TIFF spec (which restricts
# this predictor to integers). xrspatial historically allowed it
# with byte-wise differencing, which round-trips because the byte
# sequence is restored exactly even though the per-sample
# arithmetic is meaningless. Preserve that behaviour here so
# files written by older versions still read back unchanged.
_predictor_decode_u8(buf, width * samples * bytes_per_sample,
height, samples * bytes_per_sample)
else:
# Legacy path: byte-wise with caller-supplied stride. Only correct
# for round-trips with a matching byte-wise encoder; do not feed
# this path libtiff/GDAL files for multi-byte sample data.
_predictor_decode_u8(buf, width * bytes_per_sample, height,
bytes_per_sample)
return buf


def predictor_encode(data: np.ndarray, width: int, height: int,
bytes_per_sample: int) -> np.ndarray:
bytes_per_sample: int,
dtype: np.dtype | None = None,
samples: int = 1) -> np.ndarray:
"""Apply horizontal differencing predictor (predictor=2).

Parameters
----------
data : np.ndarray
Flat uint8 array of pixel data (modified in-place).
width, height : int
Image dimensions.
bytes_per_sample : int
Bytes per sample.

Returns
-------
np.ndarray
Same array, modified in-place.
Parameters mirror :func:`predictor_decode`.
"""
buf = np.ascontiguousarray(data)
_predictor_encode(buf, width, height, bytes_per_sample)
if bytes_per_sample == 1:
_predictor_encode_u8(buf, width * samples, height, samples)
elif dtype is not None and np.dtype(dtype).kind in ('i', 'u'):
_predictor_encode_samples(buf, width, height, np.dtype(dtype),
samples=samples)
elif dtype is not None and np.dtype(dtype).kind == 'f':
# See ``predictor_decode`` for why floats stay byte-wise.
_predictor_encode_u8(buf, width * samples * bytes_per_sample,
height, samples * bytes_per_sample)
else:
_predictor_encode_u8(buf, width * bytes_per_sample, height,
bytes_per_sample)
return buf


Expand Down
102 changes: 88 additions & 14 deletions xrspatial/geotiff/_gpu_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,7 +629,12 @@ def _inflate_tiles_kernel(

@cuda.jit
def _predictor_decode_kernel(data, width, height, bytes_per_sample):
"""Undo horizontal differencing (predictor=2), one thread per row."""
"""Undo horizontal differencing (predictor=2), one thread per row.

Byte-wise variant retained for 8-bit samples; multi-byte samples must
use :func:`_predictor_decode_kernel_samples` so carries between bytes
of the same sample propagate correctly.
"""
row = cuda.grid(1)
if row >= height:
return
Expand All @@ -643,6 +648,27 @@ def _predictor_decode_kernel(data, width, height, bytes_per_sample):
(numba_int32(data[idx]) + numba_int32(data[idx - bytes_per_sample])) & 0xFF)


@cuda.jit
def _predictor_decode_kernel_samples(data_view, samples_per_row, height,
stride):
"""Sample-wise horizontal-differencing decode, one thread per row.

*data_view* is a 1-D device array typed as the file's sample dtype
(e.g. uint16 / int32 / float32) so that ``+=`` between elements
preserves carries between bytes within the same sample. *stride* is
the number of typed samples between adjacent same-channel samples
(1 for single-band, samples_per_pixel for chunky multi-band) per
TIFF Tech Note 1. Operates in place.
"""
row = cuda.grid(1)
if row >= height:
return
row_start = row * samples_per_row
for col in range(stride, samples_per_row):
idx = row_start + col
data_view[idx] = data_view[idx] + data_view[idx - stride]


@cuda.jit
def _fp_predictor_decode_kernel(data, tmp, width, height, bps):
"""Undo floating-point predictor (predictor=3), one thread per row.
Expand Down Expand Up @@ -1383,11 +1409,21 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles,
total_rows = n_tiles * tile_height
tpb = min(256, total_rows)
bpg = math.ceil(total_rows / tpb)
# Kernel uses row_bytes = width * bytes_per_sample, so pass pixel
# width and full per-pixel size (itemsize * samples). Matches CPU
# call at _reader.py _apply_predictor(..., bytes_per_sample * samples).
_predictor_decode_kernel[bpg, tpb](
d_decomp, tile_width, total_rows, dtype.itemsize * samples)
# TIFF predictor=2 differences whole samples (per channel). For
# 8-bit samples (and for floats, where the spec disallows
# predictor=2 but xrspatial historically wrote byte-wise diffs
# that round-trip via the same byte-wise decoder) the byte-wise
# kernel runs. Multi-byte integer samples view the buffer as the
# sample dtype so carries propagate within each sample.
sample_wise = (dtype.itemsize > 1 and dtype.kind in ('i', 'u'))
if not sample_wise:
_predictor_decode_kernel[bpg, tpb](
d_decomp, tile_width, total_rows, samples * dtype.itemsize)
else:
samples_per_row = tile_width * samples
view = d_decomp.view(dtype=cupy.dtype(dtype))
_predictor_decode_kernel_samples[bpg, tpb](
view, samples_per_row, total_rows, samples)
cuda.synchronize()
elif predictor == 3:
total_rows = n_tiles * tile_height
Expand Down Expand Up @@ -1629,15 +1665,22 @@ def gpu_decode_tiles(

# Apply predictor on GPU
if predictor == 2:
# Horizontal differencing: one thread per row across all tiles
# Horizontal differencing: one thread per row across all tiles.
# Multi-byte integer samples use the sample-wise kernel so carries
# propagate; 8-bit samples and floats stay byte-wise (see comment
# in _apply_predictor_and_assemble for why floats are byte-wise).
total_rows = n_tiles * tile_height
tpb = min(256, total_rows)
bpg = math.ceil(total_rows / tpb)
# Reshape so each tile's rows are contiguous (they already are).
# Kernel uses row_bytes = width * bytes_per_sample, so pass pixel
# width and full per-pixel size (itemsize * samples).
_predictor_decode_kernel[bpg, tpb](
d_decomp, tile_width, total_rows, dtype.itemsize * samples)
sample_wise = (dtype.itemsize > 1 and dtype.kind in ('i', 'u'))
if not sample_wise:
_predictor_decode_kernel[bpg, tpb](
d_decomp, tile_width, total_rows, samples * dtype.itemsize)
else:
samples_per_row = tile_width * samples
view = d_decomp.view(dtype=cupy.dtype(dtype))
_predictor_decode_kernel_samples[bpg, tpb](
view, samples_per_row, total_rows, samples)
cuda.synchronize()

elif predictor == 3:
Expand Down Expand Up @@ -1723,6 +1766,8 @@ def _extract_tiles_kernel(
def _predictor_encode_kernel(data, width, height, bytes_per_sample):
"""Apply horizontal differencing (predictor=2), one thread per row.
Process right-to-left to avoid overwriting values we still need.

Byte-wise variant retained for 8-bit samples.
"""
row = cuda.grid(1)
if row >= height:
Expand All @@ -1737,6 +1782,24 @@ def _predictor_encode_kernel(data, width, height, bytes_per_sample):
(numba_int32(data[idx]) - numba_int32(data[idx - bytes_per_sample])) & 0xFF)


@cuda.jit
def _predictor_encode_kernel_samples(data_view, samples_per_row, height,
stride):
"""Sample-wise horizontal-differencing encode, one thread per row.

Counterpart to :func:`_predictor_decode_kernel_samples` -- *stride* is
the per-channel sample stride (1 for single-band, samples_per_pixel
for chunky multi-band). Operates right-to-left, in place.
"""
row = cuda.grid(1)
if row >= height:
return
row_start = row * samples_per_row
for col in range(samples_per_row - 1, stride - 1, -1):
idx = row_start + col
data_view[idx] = data_view[idx] - data_view[idx - stride]


@cuda.jit
def _fp_predictor_encode_kernel(data, tmp, width, height, bps):
"""Apply floating-point predictor (predictor=3), one thread per row."""
Expand Down Expand Up @@ -2307,8 +2370,19 @@ def gpu_compress_tiles(d_image, tile_width, tile_height,
if predictor == 2:
tpb_r = min(256, total_rows)
bpg_r = math.ceil(total_rows / tpb_r)
_predictor_encode_kernel[bpg_r, tpb_r](
d_tile_buf, tile_width * samples, total_rows, dtype.itemsize * samples)
# Sample-wise encode for multi-byte integer dtypes; byte-wise for
# 8-bit samples and floats (predictor=2 on floats is outside spec
# but xrspatial historically used byte-wise diffs, which the
# decoder still expects).
sample_wise = (dtype.itemsize > 1 and dtype.kind in ('i', 'u'))
if not sample_wise:
_predictor_encode_kernel[bpg_r, tpb_r](
d_tile_buf, tile_width, total_rows, samples * dtype.itemsize)
else:
samples_per_row = tile_width * samples
view = d_tile_buf.view(dtype=cupy.dtype(dtype))
_predictor_encode_kernel_samples[bpg_r, tpb_r](
view, samples_per_row, total_rows, samples)
cuda.synchronize()
elif predictor == 3:
tpb_r = min(256, total_rows)
Expand Down
Loading
Loading