xarray-contrib · brendancol · May 6, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,12 @@
 -----------
 
 
+### Unreleased
+
+#### Bug fixes and improvements
+- Decode TIFF predictor=2 sample-wise so multi-byte integer files written by libtiff/GDAL read back exactly
+
+
 ### Version 0.9.9 - 2026-05-05
 
 #### Bug fixes and improvements

diff --git a/xrspatial/geotiff/_compression.py b/xrspatial/geotiff/_compression.py
@@ -333,80 +333,164 @@ def lzw_compress(data: bytes) -> bytes:
 
 
 # -- Horizontal predictor (Numba) --------------------------------------------
+#
+# TIFF predictor=2 (horizontal differencing) operates on whole *samples*
+# (per-pixel-component values), not on individual bytes.  For multi-byte
+# integer samples the difference is computed across full sample values, so
+# carries between the low and high bytes are honoured.  The byte stream that
+# lands on disk is the difference samples re-encoded in the file's byte
+# order (always little-endian for files this writer emits, but readers must
+# also handle big-endian sources).
+#
+# The previous implementation here did byte-wise cumulative sum at a sample
+# stride, which produced the right answer for 8-bit data but silently
+# scrambled multi-byte integers whenever the difference between adjacent
+# samples crossed a byte boundary (i.e. needed a carry).  Both halves were
+# wrong in matching ways, so files this codebase wrote could be re-read,
+# but cross-tool reads of files written by libtiff/GDAL produced incorrect
+# pixel values.  See accuracy sweep finding (2026-05-06).
 
 @ngjit
-def _predictor_decode(data, width, height, bytes_per_sample):
-    """Undo horizontal differencing predictor (TIFF predictor=2).
+def _predictor_decode_u8(data, row_len, height, stride):
+    """Undo horizontal differencing on a byte stream (in-place, mod 256).
 
-    Operates in-place on the flat byte array, performing cumulative sum
-    per row at the sample level.
+    ``row_len`` is the full row length in bytes; ``stride`` is the number
+    of bytes between adjacent same-channel samples (e.g. samples_per_pixel
+    for chunky 8-bit multi-band data).
     """
-    row_bytes = width * bytes_per_sample
     for row in range(height):
-        row_start = row * row_bytes
-        for col in range(bytes_per_sample, row_bytes):
+        row_start = row * row_len
+        for col in range(stride, row_len):
             idx = row_start + col
-            data[idx] = np.uint8((np.int32(data[idx]) + np.int32(data[idx - bytes_per_sample])) & 0xFF)
+            data[idx] = np.uint8(
+                (np.int32(data[idx]) + np.int32(data[idx - stride])) & 0xFF)
 
 
 @ngjit
-def _predictor_encode(data, width, height, bytes_per_sample):
-    """Apply horizontal differencing predictor (TIFF predictor=2).
-
-    Operates in-place, converting absolute values to differences.
-    Process right-to-left to avoid overwriting values we still need.
-    """
-    row_bytes = width * bytes_per_sample
+def _predictor_encode_u8(data, row_len, height, stride):
+    """Apply horizontal differencing on a byte stream (in-place, mod 256)."""
     for row in range(height):
-        row_start = row * row_bytes
-        for col in range(row_bytes - 1, bytes_per_sample - 1, -1):
+        row_start = row * row_len
+        for col in range(row_len - 1, stride - 1, -1):
             idx = row_start + col
-            data[idx] = np.uint8((np.int32(data[idx]) - np.int32(data[idx - bytes_per_sample])) & 0xFF)
+            data[idx] = np.uint8(
+                (np.int32(data[idx]) - np.int32(data[idx - stride])) & 0xFF)
+
+
+def _predictor_decode_samples(data, image_width, height, dtype, samples=1):
+    """Sample-wise decode for multi-byte sample dtypes.
+
+    Modifies *data* (a flat uint8 buffer) in place.  ``dtype`` carries the
+    file's byte order so that the buffer is interpreted with the right
+    endianness; the result is written back in the same byte order.
+
+    For chunky multi-band data the cumulative sum runs per-channel so that
+    each channel's value is differenced against the previous same-channel
+    pixel (TIFF Technical Note 1, predictor=2).
+    """
+    if samples == 1:
+        view = data.view(dtype).reshape(height, image_width)
+    else:
+        view = data.view(dtype).reshape(height, image_width, samples)
+    # In-place cumsum in the dtype's modular arithmetic.  The ``out`` cast
+    # keeps the result wrapping inside the sample width for fixed-width
+    # integers.  Float dtypes never reach this branch (the public
+    # ``predictor_decode`` routes them to the byte-wise legacy kernel)
+    # because predictor=2 differencing on floats does not invert exactly.
+    np.cumsum(view, axis=1, dtype=dtype, out=view)
+
+
+def _predictor_encode_samples(data, image_width, height, dtype, samples=1):
+    """Sample-wise encode for multi-byte sample dtypes (in-place)."""
+    if samples == 1:
+        view = data.view(dtype).reshape(height, image_width)
+    else:
+        view = data.view(dtype).reshape(height, image_width, samples)
+    view[:, 1:] = np.diff(view, axis=1)
 
 
 def predictor_decode(data: np.ndarray, width: int, height: int,
-                     bytes_per_sample: int) -> np.ndarray:
+                     bytes_per_sample: int,
+                     dtype: np.dtype | None = None,
+                     samples: int = 1) -> np.ndarray:
     """Undo horizontal differencing predictor (predictor=2).
 
     Parameters
     ----------
     data : np.ndarray
         Flat uint8 array of decompressed pixel data (modified in-place).
-    width, height : int
-        Image dimensions.
+    width : int
+        Image width in pixels.  For chunky multi-band data the number of
+        samples per row is ``width * samples``.
+    height : int
+        Number of rows.
     bytes_per_sample : int
-        Bytes per sample (e.g. 1 for uint8, 4 for float32).
+        Bytes per sample (1, 2, 4, or 8).
+    dtype : np.dtype, optional
+        Sample dtype including the file's byte order.  Required when
+        ``bytes_per_sample > 1`` so the byte stream can be interpreted
+        as whole sample values; per TIFF spec, differencing is
+        sample-level (carries propagate within each sample's bytes).
+    samples : int
+        Samples per pixel for chunky planar config.
 
     Returns
     -------
     np.ndarray
         Same array, modified in-place.
+
+    Notes
+    -----
+    Predictor=2 differences each sample against the previous same-channel
+    sample (TIFF Tech Note 1).  For chunky multi-band data the byte
+    stride between same-channel samples is ``samples * bytes_per_sample``.
     """
     buf = np.ascontiguousarray(data)
-    _predictor_decode(buf, width, height, bytes_per_sample)
+    if bytes_per_sample == 1:
+        # Byte-wise math is exact for 8-bit samples; stride = channels.
+        _predictor_decode_u8(buf, width * samples, height, samples)
+    elif dtype is not None and np.dtype(dtype).kind in ('i', 'u'):
+        _predictor_decode_samples(buf, width, height, np.dtype(dtype),
+                                  samples=samples)
+    elif dtype is not None and np.dtype(dtype).kind == 'f':
+        # Predictor=2 on floats is outside the TIFF spec (which restricts
+        # this predictor to integers).  xrspatial historically allowed it
+        # with byte-wise differencing, which round-trips because the byte
+        # sequence is restored exactly even though the per-sample
+        # arithmetic is meaningless.  Preserve that behaviour here so
+        # files written by older versions still read back unchanged.
+        _predictor_decode_u8(buf, width * samples * bytes_per_sample,
+                             height, samples * bytes_per_sample)
+    else:
+        # Legacy path: byte-wise with caller-supplied stride.  Only correct
+        # for round-trips with a matching byte-wise encoder; do not feed
+        # this path libtiff/GDAL files for multi-byte sample data.
+        _predictor_decode_u8(buf, width * bytes_per_sample, height,
+                             bytes_per_sample)
     return buf
 
 
 def predictor_encode(data: np.ndarray, width: int, height: int,
-                     bytes_per_sample: int) -> np.ndarray:
+                     bytes_per_sample: int,
+                     dtype: np.dtype | None = None,
+                     samples: int = 1) -> np.ndarray:
     """Apply horizontal differencing predictor (predictor=2).
 
-    Parameters
-    ----------
-    data : np.ndarray
-        Flat uint8 array of pixel data (modified in-place).
-    width, height : int
-        Image dimensions.
-    bytes_per_sample : int
-        Bytes per sample.
-
-    Returns
-    -------
-    np.ndarray
-        Same array, modified in-place.
+    Parameters mirror :func:`predictor_decode`.
     """
     buf = np.ascontiguousarray(data)
-    _predictor_encode(buf, width, height, bytes_per_sample)
+    if bytes_per_sample == 1:
+        _predictor_encode_u8(buf, width * samples, height, samples)
+    elif dtype is not None and np.dtype(dtype).kind in ('i', 'u'):
+        _predictor_encode_samples(buf, width, height, np.dtype(dtype),
+                                  samples=samples)
+    elif dtype is not None and np.dtype(dtype).kind == 'f':
+        # See ``predictor_decode`` for why floats stay byte-wise.
+        _predictor_encode_u8(buf, width * samples * bytes_per_sample,
+                             height, samples * bytes_per_sample)
+    else:
+        _predictor_encode_u8(buf, width * bytes_per_sample, height,
+                             bytes_per_sample)
     return buf
 
 

diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py
@@ -629,7 +629,12 @@ def _inflate_tiles_kernel(
 
 @cuda.jit
 def _predictor_decode_kernel(data, width, height, bytes_per_sample):
-    """Undo horizontal differencing (predictor=2), one thread per row."""
+    """Undo horizontal differencing (predictor=2), one thread per row.
+
+    Byte-wise variant retained for 8-bit samples; multi-byte samples must
+    use :func:`_predictor_decode_kernel_samples` so carries between bytes
+    of the same sample propagate correctly.
+    """
     row = cuda.grid(1)
     if row >= height:
         return
@@ -643,6 +648,27 @@ def _predictor_decode_kernel(data, width, height, bytes_per_sample):
             (numba_int32(data[idx]) + numba_int32(data[idx - bytes_per_sample])) & 0xFF)
 
 
+@cuda.jit
+def _predictor_decode_kernel_samples(data_view, samples_per_row, height,
+                                     stride):
+    """Sample-wise horizontal-differencing decode, one thread per row.
+
+    *data_view* is a 1-D device array typed as the file's sample dtype
+    (e.g. uint16 / int32 / float32) so that ``+=`` between elements
+    preserves carries between bytes within the same sample.  *stride* is
+    the number of typed samples between adjacent same-channel samples
+    (1 for single-band, samples_per_pixel for chunky multi-band) per
+    TIFF Tech Note 1.  Operates in place.
+    """
+    row = cuda.grid(1)
+    if row >= height:
+        return
+    row_start = row * samples_per_row
+    for col in range(stride, samples_per_row):
+        idx = row_start + col
+        data_view[idx] = data_view[idx] + data_view[idx - stride]
+
+
 @cuda.jit
 def _fp_predictor_decode_kernel(data, tmp, width, height, bps):
     """Undo floating-point predictor (predictor=3), one thread per row.
@@ -1383,11 +1409,21 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles,
         total_rows = n_tiles * tile_height
         tpb = min(256, total_rows)
         bpg = math.ceil(total_rows / tpb)
-        # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel
-        # width and full per-pixel size (itemsize * samples). Matches CPU
-        # call at _reader.py _apply_predictor(..., bytes_per_sample * samples).
-        _predictor_decode_kernel[bpg, tpb](
-            d_decomp, tile_width, total_rows, dtype.itemsize * samples)
+        # TIFF predictor=2 differences whole samples (per channel).  For
+        # 8-bit samples (and for floats, where the spec disallows
+        # predictor=2 but xrspatial historically wrote byte-wise diffs
+        # that round-trip via the same byte-wise decoder) the byte-wise
+        # kernel runs.  Multi-byte integer samples view the buffer as the
+        # sample dtype so carries propagate within each sample.
+        sample_wise = (dtype.itemsize > 1 and dtype.kind in ('i', 'u'))
+        if not sample_wise:
+            _predictor_decode_kernel[bpg, tpb](
+                d_decomp, tile_width, total_rows, samples * dtype.itemsize)
+        else:
+            samples_per_row = tile_width * samples
+            view = d_decomp.view(dtype=cupy.dtype(dtype))
+            _predictor_decode_kernel_samples[bpg, tpb](
+                view, samples_per_row, total_rows, samples)
         cuda.synchronize()
     elif predictor == 3:
         total_rows = n_tiles * tile_height
@@ -1629,15 +1665,22 @@ def gpu_decode_tiles(
 
     # Apply predictor on GPU
     if predictor == 2:
-        # Horizontal differencing: one thread per row across all tiles
+        # Horizontal differencing: one thread per row across all tiles.
+        # Multi-byte integer samples use the sample-wise kernel so carries
+        # propagate; 8-bit samples and floats stay byte-wise (see comment
+        # in _apply_predictor_and_assemble for why floats are byte-wise).
         total_rows = n_tiles * tile_height
         tpb = min(256, total_rows)
         bpg = math.ceil(total_rows / tpb)
-        # Reshape so each tile's rows are contiguous (they already are).
-        # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel
-        # width and full per-pixel size (itemsize * samples).
-        _predictor_decode_kernel[bpg, tpb](
-            d_decomp, tile_width, total_rows, dtype.itemsize * samples)
+        sample_wise = (dtype.itemsize > 1 and dtype.kind in ('i', 'u'))
+        if not sample_wise:
+            _predictor_decode_kernel[bpg, tpb](
+                d_decomp, tile_width, total_rows, samples * dtype.itemsize)
+        else:
+            samples_per_row = tile_width * samples
+            view = d_decomp.view(dtype=cupy.dtype(dtype))
+            _predictor_decode_kernel_samples[bpg, tpb](
+                view, samples_per_row, total_rows, samples)
         cuda.synchronize()
 
     elif predictor == 3:
@@ -1723,6 +1766,8 @@ def _extract_tiles_kernel(
 def _predictor_encode_kernel(data, width, height, bytes_per_sample):
     """Apply horizontal differencing (predictor=2), one thread per row.
     Process right-to-left to avoid overwriting values we still need.
+
+    Byte-wise variant retained for 8-bit samples.
     """
     row = cuda.grid(1)
     if row >= height:
@@ -1737,6 +1782,24 @@ def _predictor_encode_kernel(data, width, height, bytes_per_sample):
             (numba_int32(data[idx]) - numba_int32(data[idx - bytes_per_sample])) & 0xFF)
 
 
+@cuda.jit
+def _predictor_encode_kernel_samples(data_view, samples_per_row, height,
+                                     stride):
+    """Sample-wise horizontal-differencing encode, one thread per row.
+
+    Counterpart to :func:`_predictor_decode_kernel_samples` -- *stride* is
+    the per-channel sample stride (1 for single-band, samples_per_pixel
+    for chunky multi-band).  Operates right-to-left, in place.
+    """
+    row = cuda.grid(1)
+    if row >= height:
+        return
+    row_start = row * samples_per_row
+    for col in range(samples_per_row - 1, stride - 1, -1):
+        idx = row_start + col
+        data_view[idx] = data_view[idx] - data_view[idx - stride]
+
+
 @cuda.jit
 def _fp_predictor_encode_kernel(data, tmp, width, height, bps):
     """Apply floating-point predictor (predictor=3), one thread per row."""
@@ -2307,8 +2370,19 @@ def gpu_compress_tiles(d_image, tile_width, tile_height,
     if predictor == 2:
         tpb_r = min(256, total_rows)
         bpg_r = math.ceil(total_rows / tpb_r)
-        _predictor_encode_kernel[bpg_r, tpb_r](
-            d_tile_buf, tile_width * samples, total_rows, dtype.itemsize * samples)
+        # Sample-wise encode for multi-byte integer dtypes; byte-wise for
+        # 8-bit samples and floats (predictor=2 on floats is outside spec
+        # but xrspatial historically used byte-wise diffs, which the
+        # decoder still expects).
+        sample_wise = (dtype.itemsize > 1 and dtype.kind in ('i', 'u'))
+        if not sample_wise:
+            _predictor_encode_kernel[bpg_r, tpb_r](
+                d_tile_buf, tile_width, total_rows, samples * dtype.itemsize)
+        else:
+            samples_per_row = tile_width * samples
+            view = d_tile_buf.view(dtype=cupy.dtype(dtype))
+            _predictor_encode_kernel_samples[bpg_r, tpb_r](
+                view, samples_per_row, total_rows, samples)
         cuda.synchronize()
     elif predictor == 3:
         tpb_r = min(256, total_rows)