From 0346ab75206d0cf916f74c9dce93ef0331c83a5e Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Fri, 8 May 2026 13:52:22 -0700
Subject: [PATCH 1/2] Parallelize per-tile compression in streaming write

The streaming tile-write path in write_streaming previously walked
each segment's tiles serially, calling _compress_block inline. The
non-streaming path at _writer.py:~568 already fans compress out to a
ThreadPoolExecutor sized at os.cpu_count(), since zlib, zstd, LZW and
LERC all release the GIL inside their C codecs.

Mirror that pattern inside the segment loop: build the tile arrays
sequentially, submit compress to a per-segment thread pool sized
min(n_seg_tiles, os.cpu_count()), then write the resulting buffers
to the file sequentially. The file write stays serial so the on-disk
tile layout is unchanged.

Memory cost is bounded by the segment size: tiles_per_segment
compressed buffers held briefly in RAM. For a 32-tile segment with
~50 KB compressed tiles that is ~1.6 MB. The streaming buffer cap
already bounds segment size, so peak memory growth is small.

Measured on a 4096x4096 float32 deflate streaming write: 1.69 s
serial-equivalent vs 0.27 s with the pool, a 6.2x speedup that
matches the audit estimate.
---
 xrspatial/geotiff/_writer.py                  |  45 +++-
 .../tests/test_streaming_write_parallel.py    | 196 ++++++++++++++++++
 2 files changed, 235 insertions(+), 6 deletions(-)
 create mode 100644 xrspatial/geotiff/tests/test_streaming_write_parallel.py

diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py
index f06b6f33..89ba037d 100644
--- a/xrspatial/geotiff/_writer.py
+++ b/xrspatial/geotiff/_writer.py
@@ -1400,6 +1400,8 @@ def write_streaming(dask_data, path: str, *,
                                 seg_np = seg_np.copy()
                                 seg_np[nan_mask] = seg_np.dtype.type(nodata)
 
+                        # Build tile arrays for this segment
+                        seg_tile_arrs = []
                         for tc in range(seg_start, seg_end):
                             c0 = tc * tw
                             c1 = min(c0 + tw, width)
@@ -1420,17 +1422,48 @@ def write_streaming(dask_data, path: str, *,
                             else:
                                 tile_arr = np.ascontiguousarray(tile_slice)
 
-                            compressed = _compress_block(
-                                tile_arr, tw, th, samples, out_dtype,
-                                bytes_per_sample, pred_int, comp_tag,
-                                compression_level, max_z_error)
-
+                            seg_tile_arrs.append(tile_arr)
+
+                        # Parallel compress: zlib/zstd/LZW release the GIL,
+                        # so threading actually parallelises the C-level work.
+                        # Memory cost is bounded by the segment size
+                        # (tiles_per_segment compressed buffers held in RAM
+                        # before the sequential write phase below).
+                        n_seg_tiles = len(seg_tile_arrs)
+                        if n_seg_tiles <= 1:
+                            seg_compressed = [
+                                _compress_block(
+                                    ta, tw, th, samples, out_dtype,
+                                    bytes_per_sample, pred_int, comp_tag,
+                                    compression_level, max_z_error)
+                                for ta in seg_tile_arrs
+                            ]
+                        else:
+                            from concurrent.futures import (
+                                ThreadPoolExecutor)
+                            n_workers = min(n_seg_tiles,
+                                            os.cpu_count() or 4)
+                            with ThreadPoolExecutor(
+                                    max_workers=n_workers) as pool:
+                                futures = [
+                                    pool.submit(
+                                        _compress_block,
+                                        ta, tw, th, samples, out_dtype,
+                                        bytes_per_sample, pred_int, comp_tag,
+                                        compression_level, max_z_error)
+                                    for ta in seg_tile_arrs
+                                ]
+                                seg_compressed = [
+                                    fut.result() for fut in futures]
+
+                        # Sequential file write to preserve on-disk tile order
+                        for compressed in seg_compressed:
                             actual_offsets.append(current_offset)
                             actual_counts.append(len(compressed))
                             f.write(compressed)
                             current_offset += len(compressed)
 
-                        del seg_np
+                        del seg_np, seg_tile_arrs, seg_compressed
             else:
                 # Strip layout
                 for i in range(n_entries):
diff --git a/xrspatial/geotiff/tests/test_streaming_write_parallel.py b/xrspatial/geotiff/tests/test_streaming_write_parallel.py
new file mode 100644
index 00000000..833af5b6
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_streaming_write_parallel.py
@@ -0,0 +1,196 @@
+"""Parallel per-tile compression in the streaming write path (P4).
+
+The streaming tile-write path used to compress tiles serially inside each
+horizontal segment. The non-streaming path already fans compress out to a
+``ThreadPoolExecutor`` sized at ``os.cpu_count()``. These tests confirm
+the streaming path now follows the same pattern: round-trip pixels are
+unchanged, more than one worker thread participates, and a moderate
+deflate write completes within a generous wall-time budget.
+"""
+from __future__ import annotations
+
+import inspect
+import threading
+import time
+
+import numpy as np
+import pytest
+import xarray as xr
+
+from xrspatial.geotiff import open_geotiff, to_geotiff
+from xrspatial.geotiff import _writer as writer_mod
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_dataarray(shape, dtype=np.float32, seed=20260508):
+    rng = np.random.default_rng(seed)
+    if np.issubdtype(np.dtype(dtype), np.floating):
+        arr = rng.random(shape, dtype=dtype)
+        nodata = -9999.0
+    else:
+        info = np.iinfo(dtype)
+        arr = rng.integers(info.min // 2 if info.min < 0 else 1,
+                           info.max // 2,
+                           size=shape, dtype=dtype)
+        # Pick a sentinel that is in-range for the integer dtype.
+        nodata = 0 if info.min >= 0 else -1
+    if len(shape) == 2:
+        h, w = shape
+    else:
+        h, w = shape[:2]
+    y = np.linspace(41.0, 40.0, h)
+    x = np.linspace(-106.0, -105.0, w)
+    return xr.DataArray(arr, dims=['y', 'x'][:len(shape)],
+                        coords={'y': y, 'x': x},
+                        attrs={'crs': 4326, 'nodata': nodata})
+
+
+# ---------------------------------------------------------------------------
+# Round-trip correctness across dtype/compression/tile-size combos
+# ---------------------------------------------------------------------------
+
+@pytest.mark.parametrize(
+    'dtype,compression,tile_size',
+    [
+        (np.float32, 'deflate', 256),
+        (np.float32, 'zstd', 128),
+        (np.float32, 'lzw', 256),
+        (np.uint16, 'deflate', 256),
+        (np.uint8, 'deflate', 128),
+        (np.float32, 'none', 256),
+    ],
+)
+def test_streaming_write_round_trip_unchanged(
+        dtype, compression, tile_size, tmp_path):
+    """Streaming write must be bit-exact vs. the eager write (and vs. input).
+
+    Picks shapes that force multiple horizontal segments by setting a
+    small ``streaming_buffer_bytes`` so the parallel-compress branch
+    actually fires.
+    """
+    shape = (600, 700)
+    da = _make_dataarray(shape, dtype=dtype)
+    # small chunks so dask path is taken; small buffer so segments split
+    dask_da = da.chunk({'y': 200, 'x': 200})
+
+    eager_path = str(tmp_path / 'eager.tif')
+    stream_path = str(tmp_path / 'stream.tif')
+
+    to_geotiff(da, eager_path,
+               compression=compression, tile_size=tile_size)
+
+    # Force multi-segment by limiting buffer to a couple of tile columns,
+    # if the underlying write_streaming exposes the kwarg.
+    sig = inspect.signature(writer_mod.write_streaming)
+    extra = {}
+    if 'streaming_buffer_bytes' in sig.parameters:
+        # ~3 tile columns at 256-tile widths of float32 -> few hundred KB.
+        extra['streaming_buffer_bytes'] = 256 * 1024
+    to_geotiff(dask_da, stream_path,
+               compression=compression, tile_size=tile_size, **extra)
+
+    eager = open_geotiff(eager_path)
+    stream = open_geotiff(stream_path)
+
+    # Bit-exact pixel match: streaming vs eager
+    np.testing.assert_array_equal(eager.values, stream.values)
+    # And streaming vs the source array (lossless codecs only)
+    np.testing.assert_array_equal(stream.values, da.values)
+
+
+# ---------------------------------------------------------------------------
+# Parallelism is observable: multiple thread IDs participate
+# ---------------------------------------------------------------------------
+
+def test_streaming_write_parallelism_observed(monkeypatch, tmp_path):
+    """Confirm more than one worker thread runs ``_compress_block``.
+
+    Strategy: wrap the real function so each call records the current
+    thread ID, then write a raster sized to produce many tiles inside a
+    single segment. After the write, assert at least 2 unique thread IDs
+    were observed.
+    """
+    real_compress = writer_mod._compress_block
+    seen_threads: set[int] = set()
+    lock = threading.Lock()
+
+    def recording_compress(*args, **kwargs):
+        # Hold each call long enough that the pool cannot serialise it
+        # away on a single core.
+        tid = threading.get_ident()
+        with lock:
+            seen_threads.add(tid)
+        # Tiny synthetic delay so concurrent submissions overlap.
+        time.sleep(0.005)
+        return real_compress(*args, **kwargs)
+
+    monkeypatch.setattr(writer_mod, '_compress_block', recording_compress)
+
+    # Many tiles in a single segment: 16 tile columns x 8 tile rows = 128
+    # tiles, well above any cpu count.
+    shape = (16 * 256, 16 * 256)
+    da = _make_dataarray(shape, dtype=np.float32, seed=42)
+    dask_da = da.chunk({'y': 256 * 4, 'x': 256 * 4})
+
+    path = str(tmp_path / 'parallel_check.tif')
+    to_geotiff(dask_da, path,
+               compression='deflate', tile_size=256)
+
+    assert len(seen_threads) > 1, (
+        f"Expected >1 worker threads in streaming compress, "
+        f"saw {len(seen_threads)}: {seen_threads}")
+
+
+# ---------------------------------------------------------------------------
+# Wall-clock regression guard
+# ---------------------------------------------------------------------------
+
+def test_streaming_write_perf_sanity(tmp_path):
+    """Pure regression guard. A 4096x4096 deflate streaming write should
+    finish well under 5 s on a typical dev box. Threshold is generous so
+    this is not a perf benchmark - just a tripwire if compress goes
+    serial again.
+    """
+    shape = (4096, 4096)
+    da = _make_dataarray(shape, dtype=np.float32, seed=2026)
+    dask_da = da.chunk({'y': 1024, 'x': 1024})
+
+    path = str(tmp_path / 'perf_4k.tif')
+
+    t0 = time.perf_counter()
+    to_geotiff(dask_da, path, compression='deflate', tile_size=256)
+    elapsed = time.perf_counter() - t0
+
+    # Sanity check the file was written.
+    result = open_geotiff(path)
+    assert result.shape == shape
+
+    assert elapsed < 5.0, (
+        f"Streaming 4096x4096 deflate write took {elapsed:.2f}s, "
+        f"expected <5s (regression guard)")
+
+
+# ---------------------------------------------------------------------------
+# Single-thread fallback: skipped when no public knob exists
+# ---------------------------------------------------------------------------
+
+def test_streaming_write_with_single_thread_fallback(tmp_path):
+    """If write_streaming exposes a ``threads`` kwarg, callers can opt
+    into deterministic single-thread compress. Currently it does not -
+    skip so the test stays as a placeholder for when a knob is added.
+    """
+    sig = inspect.signature(writer_mod.write_streaming)
+    if 'threads' not in sig.parameters:
+        pytest.skip(
+            "write_streaming has no 'threads' kwarg yet; skipping "
+            "deterministic single-thread fallback test")
+
+    da = _make_dataarray((400, 400), dtype=np.float32)
+    dask_da = da.chunk({'y': 200, 'x': 200})
+    path = str(tmp_path / 'threads1.tif')
+    to_geotiff(dask_da, path, compression='deflate', threads=1)
+    result = open_geotiff(path)
+    np.testing.assert_array_equal(result.values, da.values)

From 4ab4c86ec9af8a98427222b42bfe374f68001cdd Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Sat, 9 May 2026 06:15:53 -0700
Subject: [PATCH 2/2] Address Copilot review on #1531

Five findings, all acted on:

- Hoist the ThreadPoolExecutor over the entire tiled streaming write
  rather than recreating it per segment. For wide rasters with many
  horizontal segments the per-segment construction was paying the
  thread-startup cost on every stripe and offsetting the parallel
  speedup. Now one pool spans every (tile_row, segment) iteration.
- Skip the pool entirely when compression is uncompressed
  (COMPRESSION_NONE has no C-level work to release the GIL on) or
  when the host has only one usable core. Both cases fall through to
  the existing serial path.
- Update the per-segment memory-cost comment to mention BOTH the
  uncompressed seg_tile_arrs and the compressed buffers, since both
  are held simultaneously while futures resolve.
- test_streaming_write_parallelism_observed now monkeypatches
  os.cpu_count to return 4 so the assertion stays deterministic on
  single-core CI containers. Without the patch the pool would size
  to 1 and the test would fail for environment reasons.
- The wall-clock perf tripwire is gated behind XRSPATIAL_RUN_PERF_TESTS=1
  to avoid CI flakiness on shared/throttled runners; deterministic
  parallel-branch coverage already lives in the parallelism-observed
  test.
- Drop the test_streaming_write_with_single_thread_fallback placeholder.
  It gated on a `threads=` kwarg that doesn't exist and isn't planned;
  reviewing the gate-vs-call path for a non-existent kwarg adds noise
  to future readers.
---
 xrspatial/geotiff/_writer.py                  | 65 ++++++++++++-------
 .../tests/test_streaming_write_parallel.py    | 51 +++++++--------
 2 files changed, 65 insertions(+), 51 deletions(-)

diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py
index 89ba037d..00d875fe 100644
--- a/xrspatial/geotiff/_writer.py
+++ b/xrspatial/geotiff/_writer.py
@@ -1368,6 +1368,21 @@ def write_streaming(dask_data, path: str, *,
                     tiles_per_segment = max(
                         1, streaming_buffer_bytes // bytes_per_tile_col)
 
+                # Hoist the compression thread pool over the entire tiled
+                # write. Re-creating the executor per segment paid the
+                # thread-startup cost on every horizontal stripe and
+                # offset the parallel speedup on wide rasters; a single
+                # pool reused across all segments avoids that overhead.
+                # Skip the pool when compression is uncompressed (no
+                # C-level work to release the GIL on) or when the host
+                # has only one usable core.
+                from concurrent.futures import ThreadPoolExecutor
+                _pool_workers = min(tiles_per_segment, os.cpu_count() or 4)
+                _use_pool = (comp_tag != COMPRESSION_NONE
+                             and _pool_workers > 1)
+                tile_pool = (ThreadPoolExecutor(max_workers=_pool_workers)
+                             if _use_pool else None)
+
                 for tr in range(tiles_down):
                     r0 = tr * th
                     r1 = min(r0 + th, height)
@@ -1424,13 +1439,22 @@ def write_streaming(dask_data, path: str, *,
 
                             seg_tile_arrs.append(tile_arr)
 
-                        # Parallel compress: zlib/zstd/LZW release the GIL,
-                        # so threading actually parallelises the C-level work.
-                        # Memory cost is bounded by the segment size
-                        # (tiles_per_segment compressed buffers held in RAM
-                        # before the sequential write phase below).
+                        # Parallel compress on the hoisted ``tile_pool``
+                        # when it exists. zlib/zstd/LZW release the GIL,
+                        # so threading actually parallelises the C-level
+                        # work. Peak memory while the segment is in
+                        # flight covers BOTH the uncompressed
+                        # ``seg_tile_arrs`` (one full tile per column,
+                        # released after the futures resolve) AND the
+                        # compressed buffers ``seg_compressed`` (held
+                        # until the sequential write loop drains them).
+                        # Both lists are bounded by ``tiles_per_segment``
+                        # which the streaming buffer cap sets; fall
+                        # through to a serial path when the pool is None
+                        # (no compression / single core) or when only
+                        # one tile sits in this segment.
                         n_seg_tiles = len(seg_tile_arrs)
-                        if n_seg_tiles <= 1:
+                        if tile_pool is None or n_seg_tiles <= 1:
                             seg_compressed = [
                                 _compress_block(
                                     ta, tw, th, samples, out_dtype,
@@ -1439,22 +1463,16 @@ def write_streaming(dask_data, path: str, *,
                                 for ta in seg_tile_arrs
                             ]
                         else:
-                            from concurrent.futures import (
-                                ThreadPoolExecutor)
-                            n_workers = min(n_seg_tiles,
-                                            os.cpu_count() or 4)
-                            with ThreadPoolExecutor(
-                                    max_workers=n_workers) as pool:
-                                futures = [
-                                    pool.submit(
-                                        _compress_block,
-                                        ta, tw, th, samples, out_dtype,
-                                        bytes_per_sample, pred_int, comp_tag,
-                                        compression_level, max_z_error)
-                                    for ta in seg_tile_arrs
-                                ]
-                                seg_compressed = [
-                                    fut.result() for fut in futures]
+                            futures = [
+                                tile_pool.submit(
+                                    _compress_block,
+                                    ta, tw, th, samples, out_dtype,
+                                    bytes_per_sample, pred_int, comp_tag,
+                                    compression_level, max_z_error)
+                                for ta in seg_tile_arrs
+                            ]
+                            seg_compressed = [
+                                fut.result() for fut in futures]
 
                         # Sequential file write to preserve on-disk tile order
                         for compressed in seg_compressed:
@@ -1464,6 +1482,9 @@ def write_streaming(dask_data, path: str, *,
                             current_offset += len(compressed)
 
                         del seg_np, seg_tile_arrs, seg_compressed
+
+                if tile_pool is not None:
+                    tile_pool.shutdown(wait=True)
             else:
                 # Strip layout
                 for i in range(n_entries):
diff --git a/xrspatial/geotiff/tests/test_streaming_write_parallel.py b/xrspatial/geotiff/tests/test_streaming_write_parallel.py
index 833af5b6..978b6651 100644
--- a/xrspatial/geotiff/tests/test_streaming_write_parallel.py
+++ b/xrspatial/geotiff/tests/test_streaming_write_parallel.py
@@ -10,6 +10,7 @@
 from __future__ import annotations
 
 import inspect
+import os
 import threading
 import time
 
@@ -112,7 +113,16 @@ def test_streaming_write_parallelism_observed(monkeypatch, tmp_path):
     thread ID, then write a raster sized to produce many tiles inside a
     single segment. After the write, assert at least 2 unique thread IDs
     were observed.
+
+    Force ``os.cpu_count()`` to 4 in the global ``os`` module for the
+    duration of the test so the assertion stays deterministic on
+    single-core CI containers (where the real cpu_count would size the
+    pool to 1 and the test would fail for environment reasons). The
+    writer imports ``os`` lazily inside the function, so patching the
+    global module is sufficient.
     """
+    monkeypatch.setattr(os, 'cpu_count', lambda: 4)
+
     real_compress = writer_mod._compress_block
     seen_threads: set[int] = set()
     lock = threading.Lock()
@@ -145,15 +155,22 @@ def recording_compress(*args, **kwargs):
 
 
 # ---------------------------------------------------------------------------
-# Wall-clock regression guard
+# Optional wall-clock regression guard (env-gated)
 # ---------------------------------------------------------------------------
 
 def test_streaming_write_perf_sanity(tmp_path):
-    """Pure regression guard. A 4096x4096 deflate streaming write should
-    finish well under 5 s on a typical dev box. Threshold is generous so
-    this is not a perf benchmark - just a tripwire if compress goes
-    serial again.
+    """Opt-in tripwire: a 4096x4096 deflate streaming write under a
+    generous wall-clock threshold. Skipped by default because shared CI
+    runners, CPU throttling, debug builds, and slow filesystems all
+    make absolute timings flaky. Set ``XRSPATIAL_RUN_PERF_TESTS=1`` to
+    enable. The deterministic regression coverage lives in
+    :func:`test_streaming_write_parallelism_observed` -- this is just a
+    sanity bound on top.
     """
+    if os.environ.get('XRSPATIAL_RUN_PERF_TESTS') != '1':
+        pytest.skip(
+            "set XRSPATIAL_RUN_PERF_TESTS=1 to run wall-clock perf tests")
+
     shape = (4096, 4096)
     da = _make_dataarray(shape, dtype=np.float32, seed=2026)
     dask_da = da.chunk({'y': 1024, 'x': 1024})
@@ -164,33 +181,9 @@ def test_streaming_write_perf_sanity(tmp_path):
     to_geotiff(dask_da, path, compression='deflate', tile_size=256)
     elapsed = time.perf_counter() - t0
 
-    # Sanity check the file was written.
     result = open_geotiff(path)
     assert result.shape == shape
 
     assert elapsed < 5.0, (
         f"Streaming 4096x4096 deflate write took {elapsed:.2f}s, "
         f"expected <5s (regression guard)")
-
-
-# ---------------------------------------------------------------------------
-# Single-thread fallback: skipped when no public knob exists
-# ---------------------------------------------------------------------------
-
-def test_streaming_write_with_single_thread_fallback(tmp_path):
-    """If write_streaming exposes a ``threads`` kwarg, callers can opt
-    into deterministic single-thread compress. Currently it does not -
-    skip so the test stays as a placeholder for when a knob is added.
-    """
-    sig = inspect.signature(writer_mod.write_streaming)
-    if 'threads' not in sig.parameters:
-        pytest.skip(
-            "write_streaming has no 'threads' kwarg yet; skipping "
-            "deterministic single-thread fallback test")
-
-    da = _make_dataarray((400, 400), dtype=np.float32)
-    dask_da = da.chunk({'y': 200, 'x': 200})
-    path = str(tmp_path / 'threads1.tif')
-    to_geotiff(dask_da, path, compression='deflate', threads=1)
-    result = open_geotiff(path)
-    np.testing.assert_array_equal(result.values, da.values)