From a75604ac051bdfa8897e6f1941bd38bffd8f951b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 7 Apr 2026 10:38:41 +0200 Subject: [PATCH 01/22] feat: define `PreparedWrite` and `SupportsChunkPacking` data structures `PreparedWrite` models a set of per-chunk changes that would be applied to a stored chunk. `SupportsChunkPacking` is a protocol for array -> bytes codecs that can use `PreparedWrite` objects to update an existing chunk. --- src/zarr/abc/codec.py | 149 +++++++++++++++++++++++++++++- src/zarr/codecs/bytes.py | 116 ++++++++++++++++++++++- src/zarr/core/codec_pipeline.py | 4 +- tests/test_sync_codec_pipeline.py | 6 +- 4 files changed, 266 insertions(+), 9 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 79c0dcf72e..17060c66d7 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -2,6 +2,7 @@ from abc import abstractmethod from collections.abc import Mapping +from dataclasses import dataclass from typing import TYPE_CHECKING, Literal, Protocol, TypeGuard, runtime_checkable from typing_extensions import ReadOnly, TypedDict @@ -13,13 +14,13 @@ if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterable - from typing import Self + from typing import Any, Self from zarr.abc.store import ByteGetter, ByteSetter, Store from zarr.core.array_spec import ArraySpec from zarr.core.chunk_grids import ChunkGrid from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType - from zarr.core.indexing import SelectorTuple + from zarr.core.indexing import ChunkProjection, SelectorTuple from zarr.core.metadata import ArrayMetadata __all__ = [ @@ -33,6 +34,9 @@ "CodecOutput", "CodecPipeline", "GetResult", + "PreparedWrite", + "SupportsChunkCodec", + "SupportsChunkPacking", "SupportsSyncCodec", ] @@ -82,6 +86,116 @@ def _decode_sync(self, chunk_data: CO, chunk_spec: ArraySpec) -> CI: ... def _encode_sync(self, chunk_data: CI, chunk_spec: ArraySpec) -> CO | None: ... +class SupportsChunkCodec(Protocol): + """Protocol for objects that can decode/encode whole chunks synchronously. + + `ChunkTransform` satisfies this protocol. + """ + + array_spec: ArraySpec + + def decode_chunk(self, chunk_bytes: Buffer) -> NDBuffer: ... + + def encode_chunk(self, chunk_array: NDBuffer) -> Buffer | None: ... + + +class SupportsChunkPacking(Protocol): + """Protocol for codecs that can pack/unpack inner chunks into a storage blob + and manage the prepare/finalize IO lifecycle. + + `BytesCodec` and `ShardingCodec` implement this protocol. The pipeline + uses it to separate IO (prepare/finalize) from compute (encode/decode), + enabling the compute phase to run in a thread pool. + + The lifecycle is: + + 1. **Prepare**: fetch existing bytes from the store (if partial write), + unpack into per-inner-chunk buffers → `PreparedWrite` + 2. **Compute**: iterate `PreparedWrite.indexer`, decode each inner chunk, + merge new data, re-encode, update `PreparedWrite.chunk_dict` + 3. **Finalize**: pack `chunk_dict` back into a blob and write to store + """ + + @property + def inner_codec_chain(self) -> SupportsChunkCodec | None: + """The codec chain for inner chunks, or `None` to use the pipeline's.""" + ... + + def unpack_chunks( + self, + raw: Buffer | None, + chunk_spec: ArraySpec, + ) -> dict[tuple[int, ...], Buffer | None]: + """Unpack a storage blob into per-inner-chunk encoded buffers.""" + ... 
+ + def pack_chunks( + self, + chunk_dict: dict[tuple[int, ...], Buffer | None], + chunk_spec: ArraySpec, + ) -> Buffer | None: + """Pack per-inner-chunk encoded buffers into a single storage blob.""" + ... + + def prepare_read_sync( + self, + byte_getter: Any, + chunk_selection: SelectorTuple, + codec_chain: SupportsChunkCodec, + ) -> NDBuffer | None: + """Fetch and decode a chunk synchronously, returning the selected region.""" + ... + + def prepare_write_sync( + self, + byte_setter: Any, + codec_chain: SupportsChunkCodec, + chunk_selection: SelectorTuple, + out_selection: SelectorTuple, + replace: bool, + ) -> PreparedWrite: + """Prepare a synchronous write: fetch existing data if needed, unpack.""" + ... + + def finalize_write_sync( + self, + prepared: PreparedWrite, + chunk_spec: ArraySpec, + byte_setter: Any, + ) -> None: + """Pack the prepared chunk data and write it to the store.""" + ... + + async def prepare_read( + self, + byte_getter: Any, + chunk_selection: SelectorTuple, + codec_chain: SupportsChunkCodec, + ) -> NDBuffer | None: + """Async variant of `prepare_read_sync`.""" + ... + + async def prepare_write( + self, + byte_setter: Any, + codec_chain: SupportsChunkCodec, + chunk_selection: SelectorTuple, + out_selection: SelectorTuple, + replace: bool, + ) -> PreparedWrite: + """Async variant of `prepare_write_sync`.""" + ... + + async def finalize_write( + self, + prepared: PreparedWrite, + chunk_spec: ArraySpec, + byte_setter: Any, + ) -> None: + """Async variant of `finalize_write_sync`.""" + ... + + class BaseCodec[CI: CodecInput, CO: CodecOutput](Metadata): """Generic base class for codecs. @@ -207,6 +321,37 @@ class ArrayArrayCodec(BaseCodec[NDBuffer, NDBuffer]): """Base class for array-to-array codecs.""" +@dataclass +class PreparedWrite: + """Intermediate state between reading existing data and writing new data. + + Created by `prepare_write_sync` / `prepare_write`, consumed by + `finalize_write_sync` / `finalize_write`. The compute phase sits + in between: iterate over `indexer`, decode the corresponding entry + in `chunk_dict`, merge new data, re-encode, and store the result + back into `chunk_dict`. + + Attributes + ---------- + chunk_dict : dict[tuple[int, ...], Buffer | None] + Per-inner-chunk encoded bytes, keyed by chunk coordinates. + For a regular (non-sharded) array this is a single entry at + `(0,)` holding the chunk's encoded bytes. For a sharded + array it contains one entry per inner chunk in the shard, + including chunks not being modified (they pass through + unchanged). `None` means the chunk did not exist on disk. + indexer : list[ChunkProjection] + The inner chunks to modify. Each entry's `chunk_coords` + corresponds to a key in `chunk_dict`. `chunk_selection` + identifies the region within that inner chunk, and + `out_selection` identifies the corresponding region in the + source value array. This is a subset of `chunk_dict`'s keys + — untouched chunks are not listed.
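+
+    For illustration, a hypothetical instance describing a partial write to a
+    single non-sharded chunk (the buffer value is a placeholder; the
+    `ChunkProjection` call mirrors the one used by `BytesCodec.prepare_write_sync`
+    later in this patch):
+
+        PreparedWrite(
+            chunk_dict={(0,): existing_encoded_bytes},
+            indexer=[ChunkProjection((0,), (slice(0, 5),), (slice(10, 15),), False)],
+        )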
+ """ + + chunk_dict: dict[tuple[int, ...], Buffer | None] + indexer: list[ChunkProjection] + + class ArrayBytesCodec(BaseCodec[NDBuffer, Buffer]): """Base class for array-to-bytes codecs.""" diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 86bb354fb5..1943bb0fe1 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -5,15 +5,16 @@ from enum import Enum from typing import TYPE_CHECKING -from zarr.abc.codec import ArrayBytesCodec +from zarr.abc.codec import ArrayBytesCodec, PreparedWrite, SupportsChunkCodec from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON, parse_enum, parse_named_configuration from zarr.core.dtype.common import HasEndianness if TYPE_CHECKING: - from typing import Self + from typing import Any, Self from zarr.core.array_spec import ArraySpec + from zarr.core.indexing import SelectorTuple class Endian(Enum): @@ -125,3 +126,114 @@ async def _encode_single( def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length + + # -- SupportsChunkPacking -- + + @property + def inner_codec_chain(self) -> SupportsChunkCodec | None: + """Returns `None` — the pipeline should use its own codec chain.""" + return None + + def unpack_chunks( + self, + raw: Buffer | None, + chunk_spec: ArraySpec, + ) -> dict[tuple[int, ...], Buffer | None]: + """Single chunk keyed at `(0,)`.""" + return {(0,): raw} + + def pack_chunks( + self, + chunk_dict: dict[tuple[int, ...], Buffer | None], + chunk_spec: ArraySpec, + ) -> Buffer | None: + """Return the single chunk's bytes.""" + return chunk_dict.get((0,)) + + def prepare_read_sync( + self, + byte_getter: Any, + chunk_selection: SelectorTuple, + codec_chain: SupportsChunkCodec, + ) -> NDBuffer | None: + """Fetch, decode, and return the selected region synchronously.""" + raw = byte_getter.get_sync(prototype=codec_chain.array_spec.prototype) + if raw is None: + return None + chunk_array = codec_chain.decode_chunk(raw) + return chunk_array[chunk_selection] + + def prepare_write_sync( + self, + byte_setter: Any, + codec_chain: SupportsChunkCodec, + chunk_selection: SelectorTuple, + out_selection: SelectorTuple, + replace: bool, + ) -> PreparedWrite: + """Fetch existing data if needed, unpack, return `PreparedWrite`.""" + from zarr.core.indexing import ChunkProjection + + existing: Buffer | None = None + if not replace: + existing = byte_setter.get_sync(prototype=codec_chain.array_spec.prototype) + chunk_dict = self.unpack_chunks(existing, codec_chain.array_spec) + indexer = [ChunkProjection((0,), chunk_selection, out_selection, replace)] # type: ignore[arg-type] + return PreparedWrite(chunk_dict=chunk_dict, indexer=indexer) + + def finalize_write_sync( + self, + prepared: PreparedWrite, + chunk_spec: ArraySpec, + byte_setter: Any, + ) -> None: + """Pack and write to store, or delete if empty.""" + blob = self.pack_chunks(prepared.chunk_dict, chunk_spec) + if blob is None: + byte_setter.delete_sync() + else: + byte_setter.set_sync(blob) + + async def prepare_read( + self, + byte_getter: Any, + chunk_selection: SelectorTuple, + codec_chain: SupportsChunkCodec, + ) -> NDBuffer | None: + """Async variant of `prepare_read_sync`.""" + raw = await byte_getter.get(prototype=codec_chain.array_spec.prototype) + if raw is None: + return None + chunk_array = codec_chain.decode_chunk(raw) + return chunk_array[chunk_selection] + + async def prepare_write( + self, + byte_setter: Any, + codec_chain: SupportsChunkCodec, + chunk_selection: SelectorTuple, + 
out_selection: SelectorTuple, + replace: bool, + ) -> PreparedWrite: + """Async variant of `prepare_write_sync`.""" + from zarr.core.indexing import ChunkProjection + + existing: Buffer | None = None + if not replace: + existing = await byte_setter.get(prototype=codec_chain.array_spec.prototype) + chunk_dict = self.unpack_chunks(existing, codec_chain.array_spec) + indexer = [ChunkProjection((0,), chunk_selection, out_selection, replace)] # type: ignore[arg-type] + return PreparedWrite(chunk_dict=chunk_dict, indexer=indexer) + + async def finalize_write( + self, + prepared: PreparedWrite, + chunk_spec: ArraySpec, + byte_setter: Any, + ) -> None: + """Async variant of `finalize_write_sync`.""" + blob = self.pack_chunks(prepared.chunk_dict, chunk_spec) + if blob is None: + await byte_setter.delete() + else: + await byte_setter.set(blob) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 0edc47ff6b..f4518cb9e9 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -118,7 +118,7 @@ def __post_init__(self) -> None: bb_sync.append(bb_codec) self._bb_codecs = tuple(bb_sync) - def decode( + def decode_chunk( self, chunk_bytes: Buffer, ) -> NDBuffer: @@ -137,7 +137,7 @@ def decode( return chunk_array - def encode( + def encode_chunk( self, chunk_array: NDBuffer, ) -> Buffer | None: diff --git a/tests/test_sync_codec_pipeline.py b/tests/test_sync_codec_pipeline.py index 1bfde7c837..da0021bca8 100644 --- a/tests/test_sync_codec_pipeline.py +++ b/tests/test_sync_codec_pipeline.py @@ -99,9 +99,9 @@ def test_encode_decode_roundtrip( chain = ChunkTransform(codecs=codecs, array_spec=spec) nd_buf = _make_nd_buffer(arr) - encoded = chain.encode(nd_buf) + encoded = chain.encode_chunk(nd_buf) assert encoded is not None - decoded = chain.decode(encoded) + decoded = chain.decode_chunk(encoded) np.testing.assert_array_equal(arr, decoded.as_numpy_array()) @@ -142,4 +142,4 @@ def _encode_sync(self, chunk_array: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer ) arr = np.arange(12, dtype="float64").reshape(3, 4) nd_buf = _make_nd_buffer(arr) - assert chain.encode(nd_buf) is None + assert chain.encode_chunk(nd_buf) is None From 47a407f29a49842922093b55a6cc82c924289443 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Tue, 7 Apr 2026 13:57:53 +0200 Subject: [PATCH 02/22] feat: new codec pipeline that uses sync path --- src/zarr/abc/codec.py | 1 + src/zarr/codecs/sharding.py | 166 ++++++++++++ src/zarr/core/codec_pipeline.py | 388 ++++++++++++++++++++++++++++ tests/test_phased_codec_pipeline.py | 293 +++++++++++++++++++++ tests/test_pipeline_benchmark.py | 163 ++++++++++++ 5 files changed, 1011 insertions(+) create mode 100644 tests/test_phased_codec_pipeline.py create mode 100644 tests/test_pipeline_benchmark.py diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index 17060c66d7..b250b95521 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -99,6 +99,7 @@ def decode_chunk(self, chunk_bytes: Buffer) -> NDBuffer: ... def encode_chunk(self, chunk_array: NDBuffer) -> Buffer | None: ... +@runtime_checkable class SupportsChunkPacking(Protocol): """Protocol for codecs that can pack/unpack inner chunks into a storage blob and manage the prepare/finalize IO lifecycle. 
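For orientation, a rough sketch of how a caller might drive the prepare/compute/finalize lifecycle that `SupportsChunkPacking` describes; the `@runtime_checkable` decorator added above is what makes the `isinstance` guard possible, and `codec`, `byte_setter`, `codec_chain`, `chunk_spec`, the selections, and `value` are placeholder inputs rather than names from this patch:

    from zarr.abc.codec import SupportsChunkPacking

    def write_one_chunk(codec, byte_setter, codec_chain, chunk_spec,
                        chunk_selection, out_selection, value):
        # Only codecs that opt into the packing protocol take this path.
        assert isinstance(codec, SupportsChunkPacking)
        # Phase 1 (IO): fetch existing bytes and unpack into per-inner-chunk buffers.
        prepared = codec.prepare_write_sync(
            byte_setter, codec_chain, chunk_selection, out_selection, replace=False
        )
        # Phase 2 (compute): decode, merge, and re-encode each touched inner chunk.
        for chunk_coords, inner_sel, out_sel, _ in prepared.indexer:
            existing = prepared.chunk_dict[chunk_coords]
            if existing is None:
                # A full implementation would allocate a fill-value buffer here.
                continue
            chunk_array = codec_chain.decode_chunk(existing)
            chunk_array[inner_sel] = value[out_sel]
            prepared.chunk_dict[chunk_coords] = codec_chain.encode_chunk(chunk_array)
        # Phase 3 (IO): pack the chunk dict back into a blob and write it to the store.
        codec.finalize_write_sync(prepared, chunk_spec, byte_setter)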
diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 9f26bc57b1..8b9c73be03 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -333,6 +333,12 @@ def __init__( # object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec)) object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) + object.__setattr__( + self, "_get_inner_chunk_transform", lru_cache()(self._get_inner_chunk_transform) + ) + object.__setattr__( + self, "_get_index_chunk_transform", lru_cache()(self._get_index_chunk_transform) + ) # todo: typedict return type def __getstate__(self) -> dict[str, Any]: @@ -349,6 +355,12 @@ def __setstate__(self, state: dict[str, Any]) -> None: # object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec)) object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) + object.__setattr__( + self, "_get_inner_chunk_transform", lru_cache()(self._get_inner_chunk_transform) + ) + object.__setattr__( + self, "_get_index_chunk_transform", lru_cache()(self._get_index_chunk_transform) + ) @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: @@ -403,6 +415,160 @@ def validate( f"needs to be divisible by the shard's inner `chunk_shape` (got {self.chunk_shape})." ) + def _get_inner_chunk_transform(self, shard_spec: ArraySpec) -> Any: + """Build a ChunkTransform for inner codecs, bound to the inner chunk spec.""" + from zarr.core.codec_pipeline import ChunkTransform + + chunk_spec = self._get_chunk_spec(shard_spec) + evolved = tuple(c.evolve_from_array_spec(array_spec=chunk_spec) for c in self.codecs) + return ChunkTransform(codecs=evolved, array_spec=chunk_spec) + + def _get_index_chunk_transform(self, chunks_per_shard: tuple[int, ...]) -> Any: + """Build a ChunkTransform for index codecs.""" + from zarr.core.codec_pipeline import ChunkTransform + + index_spec = self._get_index_chunk_spec(chunks_per_shard) + evolved = tuple(c.evolve_from_array_spec(array_spec=index_spec) for c in self.index_codecs) + return ChunkTransform(codecs=evolved, array_spec=index_spec) + + def _decode_shard_index_sync( + self, index_bytes: Buffer, chunks_per_shard: tuple[int, ...] + ) -> _ShardIndex: + """Decode shard index synchronously using ChunkTransform.""" + index_transform = self._get_index_chunk_transform(chunks_per_shard) + index_array = index_transform.decode_chunk(index_bytes) + return _ShardIndex(index_array.as_numpy_array()) + + def _encode_shard_index_sync(self, index: _ShardIndex) -> Buffer: + """Encode shard index synchronously using ChunkTransform.""" + index_transform = self._get_index_chunk_transform(index.chunks_per_shard) + index_nd = get_ndbuffer_class().from_numpy_array(index.offsets_and_lengths) + result = index_transform.encode_chunk(index_nd) + assert result is not None + return result + + def _shard_reader_from_bytes_sync( + self, buf: Buffer, chunks_per_shard: tuple[int, ...] 
+ ) -> _ShardReader: + """Sync version of _ShardReader.from_bytes.""" + shard_index_size = self._shard_index_size(chunks_per_shard) + if self.index_location == ShardingCodecIndexLocation.start: + shard_index_bytes = buf[:shard_index_size] + else: + shard_index_bytes = buf[-shard_index_size:] + index = self._decode_shard_index_sync(shard_index_bytes, chunks_per_shard) + reader = _ShardReader() + reader.buf = buf + reader.index = index + return reader + + def _decode_sync( + self, + shard_bytes: Buffer, + shard_spec: ArraySpec, + ) -> NDBuffer: + """Decode a full shard synchronously.""" + shard_shape = shard_spec.shape + chunk_shape = self.chunk_shape + chunks_per_shard = self._get_chunks_per_shard(shard_spec) + chunk_spec = self._get_chunk_spec(shard_spec) + inner_transform = self._get_inner_chunk_transform(shard_spec) + + indexer = BasicIndexer( + tuple(slice(0, s) for s in shard_shape), + shape=shard_shape, + chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape), + ) + + out = chunk_spec.prototype.nd_buffer.empty( + shape=shard_shape, + dtype=shard_spec.dtype.to_native_dtype(), + order=shard_spec.order, + ) + + shard_dict = self._shard_reader_from_bytes_sync(shard_bytes, chunks_per_shard) + + if shard_dict.index.is_all_empty(): + out.fill(shard_spec.fill_value) + return out + + for chunk_coords, chunk_selection, out_selection, _ in indexer: + try: + chunk_bytes = shard_dict[chunk_coords] + except KeyError: + out[out_selection] = shard_spec.fill_value + continue + chunk_array = inner_transform.decode_chunk(chunk_bytes) + out[out_selection] = chunk_array[chunk_selection] + + return out + + def _encode_sync( + self, + shard_array: NDBuffer, + shard_spec: ArraySpec, + ) -> Buffer | None: + """Encode a full shard synchronously.""" + shard_shape = shard_spec.shape + chunks_per_shard = self._get_chunks_per_shard(shard_spec) + inner_transform = self._get_inner_chunk_transform(shard_spec) + + indexer = BasicIndexer( + tuple(slice(0, s) for s in shard_shape), + shape=shard_shape, + chunk_grid=RegularChunkGrid(chunk_shape=self.chunk_shape), + ) + + shard_builder: dict[tuple[int, ...], Buffer | None] = dict.fromkeys( + morton_order_iter(chunks_per_shard) + ) + + for chunk_coords, chunk_selection, out_selection, _ in indexer: + chunk_array = shard_array[out_selection] + encoded = inner_transform.encode_chunk(chunk_array) + shard_builder[chunk_coords] = encoded + + return self._encode_shard_dict_sync( + shard_builder, + chunks_per_shard=chunks_per_shard, + buffer_prototype=default_buffer_prototype(), + ) + + def _encode_shard_dict_sync( + self, + shard_dict: ShardMapping, + chunks_per_shard: tuple[int, ...], + buffer_prototype: BufferPrototype, + ) -> Buffer | None: + """Sync version of _encode_shard_dict.""" + index = _ShardIndex.create_empty(chunks_per_shard) + buffers = [] + template = buffer_prototype.buffer.create_zero_length() + chunk_start = 0 + + for chunk_coords in morton_order_iter(chunks_per_shard): + value = shard_dict.get(chunk_coords) + if value is None or len(value) == 0: + continue + chunk_length = len(value) + buffers.append(value) + index.set_chunk_slice(chunk_coords, slice(chunk_start, chunk_start + chunk_length)) + chunk_start += chunk_length + + if len(buffers) == 0: + return None + + index_bytes = self._encode_shard_index_sync(index) + if self.index_location == ShardingCodecIndexLocation.start: + empty_chunks_mask = index.offsets_and_lengths[..., 0] == MAX_UINT_64 + index.offsets_and_lengths[~empty_chunks_mask, 0] += len(index_bytes) + index_bytes = 
self._encode_shard_index_sync(index) + buffers.insert(0, index_bytes) + else: + buffers.append(index_bytes) + + return template.combine(buffers) + async def _decode_single( self, shard_bytes: Buffer, diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index f4518cb9e9..33048d27fd 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -1,5 +1,6 @@ from __future__ import annotations +from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass, field from itertools import islice, pairwise from typing import TYPE_CHECKING, Any @@ -679,3 +680,390 @@ def codecs_from_list( register_pipeline(BatchedCodecPipeline) + + +@dataclass(frozen=True) +class PhasedCodecPipeline(CodecPipeline): + """Codec pipeline using the three-phase prepare/compute/finalize pattern. + + Separates IO (prepare, finalize) from compute (encode, decode) so that + the compute phase can run without holding IO resources. This is the + foundation for thread-pool-based parallelism. + + Works with any ``ArrayBytesCodec``. The sync path (``read_sync`` / + ``write_sync``) requires ``SupportsChunkPacking`` and ``SupportsSyncCodec``. + """ + + codecs: tuple[Codec, ...] + chunk_transform: ChunkTransform | None + batch_size: int + + @classmethod + def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self: + codec_list = tuple(codecs) + codecs_from_list(codec_list) # validate codec ordering + + if batch_size is None: + batch_size = config.get("codec_pipeline.batch_size") + + return cls( + codecs=codec_list, + chunk_transform=None, + batch_size=batch_size, + ) + + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: + evolved_codecs = tuple(c.evolve_from_array_spec(array_spec=array_spec) for c in self.codecs) + # Only create ChunkTransform if all codecs support sync + all_sync = all(isinstance(c, SupportsSyncCodec) for c in evolved_codecs) + chunk_transform = ChunkTransform(codecs=evolved_codecs, array_spec=array_spec) if all_sync else None + return type(self)( + codecs=evolved_codecs, + chunk_transform=chunk_transform, + batch_size=self.batch_size, + ) + + def __iter__(self) -> Iterator[Codec]: + return iter(self.codecs) + + @property + def supports_partial_decode(self) -> bool: + ab = self._ab_codec + return isinstance(ab, ArrayBytesCodecPartialDecodeMixin) + + @property + def supports_partial_encode(self) -> bool: + ab = self._ab_codec + return isinstance(ab, ArrayBytesCodecPartialEncodeMixin) + + def validate( + self, *, shape: tuple[int, ...], dtype: ZDType[TBaseDType, TBaseScalar], chunk_grid: ChunkGrid + ) -> None: + for codec in self.codecs: + codec.validate(shape=shape, dtype=dtype, chunk_grid=chunk_grid) + + def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int: + if self.chunk_transform is not None: + return self.chunk_transform.compute_encoded_size(byte_length, array_spec) + return byte_length + + async def decode( + self, + chunk_bytes_and_specs: Iterable[tuple[Buffer | None, ArraySpec]], + ) -> Iterable[NDBuffer | None]: + """Decode a batch of chunks through the full codec chain.""" + aa, ab, bb = codecs_from_list(self.codecs) + chunk_bytes_batch: Iterable[Buffer | None] + chunk_bytes_batch, chunk_specs = _unzip2(chunk_bytes_and_specs) + + for bb_codec in bb[::-1]: + chunk_bytes_batch = await bb_codec.decode( + zip(chunk_bytes_batch, chunk_specs, strict=False) + ) + chunk_array_batch = await ab.decode( + zip(chunk_bytes_batch, chunk_specs, strict=False) + ) + for aa_codec in 
aa[::-1]: + chunk_array_batch = await aa_codec.decode( + zip(chunk_array_batch, chunk_specs, strict=False) + ) + return chunk_array_batch + + async def encode( + self, + chunk_arrays_and_specs: Iterable[tuple[NDBuffer | None, ArraySpec]], + ) -> Iterable[Buffer | None]: + """Encode a batch of chunks through the full codec chain.""" + aa, ab, bb = codecs_from_list(self.codecs) + chunk_array_batch: Iterable[NDBuffer | None] + chunk_array_batch, chunk_specs = _unzip2(chunk_arrays_and_specs) + + for aa_codec in aa: + chunk_array_batch = await aa_codec.encode( + zip(chunk_array_batch, chunk_specs, strict=False) + ) + chunk_bytes_batch = await ab.encode( + zip(chunk_array_batch, chunk_specs, strict=False) + ) + for bb_codec in bb: + chunk_bytes_batch = await bb_codec.encode( + zip(chunk_bytes_batch, chunk_specs, strict=False) + ) + return chunk_bytes_batch + + @property + def _ab_codec(self) -> ArrayBytesCodec: + _, ab, _ = codecs_from_list(self.codecs) + return ab + + # -- Phase 2: pure compute (no IO) -- + + def _transform_read( + self, + raw: Buffer | None, + _chunk_spec: ArraySpec, + ) -> NDBuffer | None: + """Decode raw bytes into an array. Pure sync compute, no IO. + + Requires ``chunk_transform`` (all codecs must support sync). + Raises ``RuntimeError`` if called without a chunk transform. + """ + if raw is None: + return None + if self.chunk_transform is None: + raise RuntimeError( + "Cannot call _transform_read without a ChunkTransform. " + "All codecs must implement SupportsSyncCodec for sync compute." + ) + return self.chunk_transform.decode_chunk(raw) + + def _transform_write( + self, + existing: Buffer | None, + chunk_spec: ArraySpec, + chunk_selection: SelectorTuple, + out_selection: SelectorTuple, + value: NDBuffer, + drop_axes: tuple[int, ...], + ) -> Buffer | None: + """Decode existing, merge new data, re-encode. Pure sync compute, no IO. + + Requires ``chunk_transform`` (all codecs must support sync). + Raises ``RuntimeError`` if called without a chunk transform. + """ + if self.chunk_transform is None: + raise RuntimeError( + "Cannot call _transform_write without a ChunkTransform. " + "All codecs must implement SupportsSyncCodec for sync compute." 
+ ) + + if existing is not None: + chunk_array: NDBuffer | None = self.chunk_transform.decode_chunk(existing) + else: + chunk_array = None + + if chunk_array is None: + chunk_array = chunk_spec.prototype.nd_buffer.create( + shape=chunk_spec.shape, + dtype=chunk_spec.dtype.to_native_dtype(), + fill_value=fill_value_or_default(chunk_spec), + ) + + # Merge new data + if drop_axes: + chunk_value = value[out_selection] + chunk_array[chunk_selection] = chunk_value.squeeze(axis=drop_axes) + else: + chunk_array[chunk_selection] = value[out_selection] + + return self.chunk_transform.encode_chunk(chunk_array) + + # -- Phase 3: scatter (read) / store (write) -- + + @staticmethod + def _scatter( + batch: list[tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool]], + decoded: list[NDBuffer | None], + out: NDBuffer, + drop_axes: tuple[int, ...], + ) -> tuple[GetResult, ...]: + """Write decoded chunk arrays into the output buffer.""" + results: list[GetResult] = [] + for (_, chunk_spec, chunk_selection, out_selection, _), chunk_array in zip( + batch, decoded, strict=True + ): + if chunk_array is not None: + selected = chunk_array[chunk_selection] + if drop_axes: + selected = selected.squeeze(axis=drop_axes) + out[out_selection] = selected + results.append(GetResult(status="present")) + else: + out[out_selection] = fill_value_or_default(chunk_spec) + results.append(GetResult(status="missing")) + return tuple(results) + + # -- Async API -- + + async def read( + self, + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + out: NDBuffer, + drop_axes: tuple[int, ...] = (), + ) -> tuple[GetResult, ...]: + batch = list(batch_info) + if not batch: + return () + + # Phase 1: IO — fetch all raw bytes concurrently + raw_buffers: list[Buffer | None] = await concurrent_map( + [(bg, cs.prototype) for bg, cs, *_ in batch], + lambda bg, proto: bg.get(prototype=proto), + config.get("async.concurrency"), + ) + + # Phase 2: compute — decode all chunks + if self.chunk_transform is not None: + # All codecs support sync — offload to threads for parallelism + import asyncio + + decoded: list[NDBuffer | None] = list(await asyncio.gather(*[ + asyncio.to_thread(self._transform_read, raw, cs) + for raw, (_, cs, *_) in zip(raw_buffers, batch, strict=True) + ])) + else: + # Some codecs are async-only — decode inline (no threading, no deadlock) + decoded = list(await self.decode( + zip(raw_buffers, [cs for _, cs, *_ in batch], strict=False) + )) + + # Phase 3: scatter + return self._scatter(batch, decoded, out, drop_axes) + + async def write( + self, + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + value: NDBuffer, + drop_axes: tuple[int, ...] 
= (), + ) -> None: + batch = list(batch_info) + if not batch: + return + + # Phase 1: IO — fetch existing bytes concurrently (skip for complete writes) + async def _fetch_existing( + byte_setter: ByteSetter, chunk_spec: ArraySpec, is_complete: bool + ) -> Buffer | None: + if is_complete: + return None + return await byte_setter.get(prototype=chunk_spec.prototype) + + existing_buffers: list[Buffer | None] = await concurrent_map( + [(bs, cs, ic) for bs, cs, _, _, ic in batch], + _fetch_existing, + config.get("async.concurrency"), + ) + + # Phase 2: compute — decode, merge, re-encode + if self.chunk_transform is not None: + # All codecs support sync — offload to threads for parallelism + import asyncio + + blobs: list[Buffer | None] = list(await asyncio.gather(*[ + asyncio.to_thread( + self._transform_write, existing, cs, csel, osel, value, drop_axes + ) + for existing, (_, cs, csel, osel, _) in zip( + existing_buffers, batch, strict=True + ) + ])) + else: + # Some codecs are async-only — encode inline (no threading, no deadlock) + blobs = [] + for existing, (_, cs, csel, osel, _) in zip( + existing_buffers, batch, strict=True + ): + if existing is not None: + chunk_array_batch = await self.decode([(existing, cs)]) + chunk_array = next(iter(chunk_array_batch)) + else: + chunk_array = None + + if chunk_array is None: + chunk_array = cs.prototype.nd_buffer.create( + shape=cs.shape, + dtype=cs.dtype.to_native_dtype(), + fill_value=fill_value_or_default(cs), + ) + + if drop_axes: + chunk_value = value[osel] + chunk_array[csel] = chunk_value.squeeze(axis=drop_axes) + else: + chunk_array[csel] = value[osel] + + encoded_batch = await self.encode([(chunk_array, cs)]) + blobs.append(next(iter(encoded_batch))) + + # Phase 3: IO — write results concurrently + async def _store_one(byte_setter: ByteSetter, blob: Buffer | None) -> None: + if blob is None: + await byte_setter.delete() + else: + await byte_setter.set(blob) + + await concurrent_map( + [(bs, blob) for (bs, *_), blob in zip(batch, blobs, strict=True)], + _store_one, + config.get("async.concurrency"), + ) + + # -- Sync API -- + + def read_sync( + self, + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + out: NDBuffer, + drop_axes: tuple[int, ...] = (), + n_workers: int = 0, + ) -> None: + """Synchronous read. Same three phases as async, different IO wrapper.""" + batch = list(batch_info) + if not batch: + return + + # Phase 1: IO — fetch all raw bytes serially + raw_buffers: list[Buffer | None] = [ + bg.get_sync(prototype=cs.prototype) for bg, cs, *_ in batch + ] + + # Phase 2: compute — decode (optionally threaded) + specs = [cs for _, cs, *_ in batch] + if n_workers > 0 and len(batch) > 1: + with ThreadPoolExecutor(max_workers=n_workers) as pool: + decoded = list(pool.map(self._transform_read, raw_buffers, specs)) + else: + decoded = [ + self._transform_read(raw, cs) + for raw, cs in zip(raw_buffers, specs, strict=True) + ] + + # Phase 3: scatter + self._scatter(batch, decoded, out, drop_axes) + + def write_sync( + self, + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + value: NDBuffer, + drop_axes: tuple[int, ...] = (), + n_workers: int = 0, + ) -> None: + """Synchronous write. 
Same three phases as async, different IO wrapper.""" + batch = list(batch_info) + if not batch: + return + + # Phase 1: IO — fetch existing bytes serially + existing_buffers: list[Buffer | None] = [ + None if ic else bs.get_sync(prototype=cs.prototype) + for bs, cs, _, _, ic in batch + ] + + # Phase 2: compute — decode, merge, re-encode (optionally threaded) + def _compute(idx: int) -> Buffer | None: + _, cs, csel, osel, _ = batch[idx] + return self._transform_write(existing_buffers[idx], cs, csel, osel, value, drop_axes) + + indices = list(range(len(batch))) + if n_workers > 0 and len(batch) > 1: + with ThreadPoolExecutor(max_workers=n_workers) as pool: + blobs: list[Buffer | None] = list(pool.map(_compute, indices)) + else: + blobs = [_compute(i) for i in indices] + + # Phase 3: IO — write results serially + for (bs, *_), blob in zip(batch, blobs, strict=True): + if blob is None: + bs.delete_sync() + else: + bs.set_sync(blob) diff --git a/tests/test_phased_codec_pipeline.py b/tests/test_phased_codec_pipeline.py new file mode 100644 index 0000000000..2b81787858 --- /dev/null +++ b/tests/test_phased_codec_pipeline.py @@ -0,0 +1,293 @@ +"""Tests for PhasedCodecPipeline — the three-phase prepare/compute/finalize pipeline.""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import pytest + +import zarr +from zarr.codecs.bytes import BytesCodec +from zarr.codecs.gzip import GzipCodec +from zarr.codecs.transpose import TransposeCodec +from zarr.codecs.zstd import ZstdCodec +from zarr.core.codec_pipeline import PhasedCodecPipeline +from zarr.storage import MemoryStore, StorePath + + +def _create_array( + shape: tuple[int, ...], + dtype: str = "float64", + chunks: tuple[int, ...] | None = None, + codecs: tuple[Any, ...] 
= (BytesCodec(),), + fill_value: object = 0, +) -> zarr.Array: + """Create a zarr array using PhasedCodecPipeline.""" + if chunks is None: + chunks = shape + + pipeline = PhasedCodecPipeline.from_codecs(codecs) + + return zarr.create_array( + StorePath(MemoryStore()), + shape=shape, + dtype=dtype, + chunks=chunks, + filters=[c for c in codecs if not isinstance(c, BytesCodec)], + serializer=BytesCodec() if any(isinstance(c, BytesCodec) for c in codecs) else "auto", + compressors=None, + fill_value=fill_value, + ) + + +@pytest.mark.parametrize( + "codecs", + [ + (BytesCodec(),), + (BytesCodec(), GzipCodec(level=1)), + (BytesCodec(), ZstdCodec(level=1)), + (TransposeCodec(order=(1, 0)), BytesCodec()), + (TransposeCodec(order=(1, 0)), BytesCodec(), ZstdCodec(level=1)), + ], + ids=["bytes-only", "gzip", "zstd", "transpose", "transpose+zstd"], +) +def test_construction(codecs: tuple[Any, ...]) -> None: + """PhasedCodecPipeline can be constructed from valid codec combinations.""" + pipeline = PhasedCodecPipeline.from_codecs(codecs) + assert pipeline.codecs == codecs + + +def test_evolve_from_array_spec() -> None: + """evolve_from_array_spec creates a ChunkTransform.""" + from zarr.core.array_spec import ArrayConfig, ArraySpec + from zarr.core.buffer import default_buffer_prototype + from zarr.core.dtype import get_data_type_from_native_dtype + + pipeline = PhasedCodecPipeline.from_codecs((BytesCodec(),)) + assert pipeline.chunk_transform is None + + zdtype = get_data_type_from_native_dtype(np.dtype("float64")) + spec = ArraySpec( + shape=(100,), + dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + evolved = pipeline.evolve_from_array_spec(spec) + assert evolved.chunk_transform is not None + + +@pytest.mark.parametrize( + ("dtype", "shape"), + [ + ("float64", (100,)), + ("float32", (50,)), + ("int32", (200,)), + ("float64", (10, 10)), + ], + ids=["f64-1d", "f32-1d", "i32-1d", "f64-2d"], +) +async def test_read_write_roundtrip(dtype: str, shape: tuple[int, ...]) -> None: + """Data written through PhasedCodecPipeline can be read back correctly.""" + from zarr.core.array_spec import ArrayConfig, ArraySpec + from zarr.core.buffer import default_buffer_prototype + from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer + from zarr.core.dtype import get_data_type_from_native_dtype + + store = MemoryStore() + zdtype = get_data_type_from_native_dtype(np.dtype(dtype)) + spec = ArraySpec( + shape=shape, + dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + + pipeline = PhasedCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = pipeline.evolve_from_array_spec(spec) + + # Write + data = np.arange(int(np.prod(shape)), dtype=dtype).reshape(shape) + value = CPUNDBuffer.from_numpy_array(data) + chunk_selection = tuple(slice(0, s) for s in shape) + out_selection = chunk_selection + + store_path = StorePath(store, "c/0") + await pipeline.write( + [(store_path, spec, chunk_selection, out_selection, True)], + value, + ) + + # Read + out = CPUNDBuffer.from_numpy_array(np.zeros(shape, dtype=dtype)) + await pipeline.read( + [(store_path, spec, chunk_selection, out_selection, True)], + out, + ) + + np.testing.assert_array_equal(data, out.as_numpy_array()) + + +async def test_read_missing_chunk_fills() -> None: + """Reading a missing chunk fills with the fill value.""" + from zarr.core.array_spec import ArrayConfig, 
ArraySpec + from zarr.core.buffer import default_buffer_prototype + from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer + from zarr.core.dtype import get_data_type_from_native_dtype + + store = MemoryStore() + zdtype = get_data_type_from_native_dtype(np.dtype("float64")) + spec = ArraySpec( + shape=(10,), + dtype=zdtype, + fill_value=zdtype.cast_scalar(42.0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + + pipeline = PhasedCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = pipeline.evolve_from_array_spec(spec) + + out = CPUNDBuffer.from_numpy_array(np.zeros(10, dtype="float64")) + store_path = StorePath(store, "c/0") + chunk_sel = (slice(0, 10),) + + await pipeline.read( + [(store_path, spec, chunk_sel, chunk_sel, True)], + out, + ) + + np.testing.assert_array_equal(out.as_numpy_array(), np.full(10, 42.0)) + + +# --------------------------------------------------------------------------- +# Sync path tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("dtype", "shape"), + [ + ("float64", (100,)), + ("float32", (50,)), + ("int32", (200,)), + ("float64", (10, 10)), + ], + ids=["f64-1d", "f32-1d", "i32-1d", "f64-2d"], +) +def test_read_write_sync_roundtrip(dtype: str, shape: tuple[int, ...]) -> None: + """Data written via write_sync can be read back via read_sync.""" + from zarr.core.array_spec import ArrayConfig, ArraySpec + from zarr.core.buffer import default_buffer_prototype + from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer + from zarr.core.dtype import get_data_type_from_native_dtype + + store = MemoryStore() + zdtype = get_data_type_from_native_dtype(np.dtype(dtype)) + spec = ArraySpec( + shape=shape, + dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + + pipeline = PhasedCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = pipeline.evolve_from_array_spec(spec) + + data = np.arange(int(np.prod(shape)), dtype=dtype).reshape(shape) + value = CPUNDBuffer.from_numpy_array(data) + chunk_selection = tuple(slice(0, s) for s in shape) + out_selection = chunk_selection + store_path = StorePath(store, "c/0") + + # Write sync + pipeline.write_sync( + [(store_path, spec, chunk_selection, out_selection, True)], + value, + ) + + # Read sync + out = CPUNDBuffer.from_numpy_array(np.zeros(shape, dtype=dtype)) + pipeline.read_sync( + [(store_path, spec, chunk_selection, out_selection, True)], + out, + ) + + np.testing.assert_array_equal(data, out.as_numpy_array()) + + +def test_read_sync_missing_chunk_fills() -> None: + """Sync read of a missing chunk fills with the fill value.""" + from zarr.core.array_spec import ArrayConfig, ArraySpec + from zarr.core.buffer import default_buffer_prototype + from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer + from zarr.core.dtype import get_data_type_from_native_dtype + + store = MemoryStore() + zdtype = get_data_type_from_native_dtype(np.dtype("float64")) + spec = ArraySpec( + shape=(10,), + dtype=zdtype, + fill_value=zdtype.cast_scalar(42.0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + + pipeline = PhasedCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = pipeline.evolve_from_array_spec(spec) + + out = CPUNDBuffer.from_numpy_array(np.zeros(10, dtype="float64")) + store_path = StorePath(store, "c/0") + chunk_sel = (slice(0, 10),) + + 
pipeline.read_sync( + [(store_path, spec, chunk_sel, chunk_sel, True)], + out, + ) + + np.testing.assert_array_equal(out.as_numpy_array(), np.full(10, 42.0)) + + +async def test_sync_write_async_read_roundtrip() -> None: + """Data written via write_sync can be read back via async read.""" + from zarr.core.array_spec import ArrayConfig, ArraySpec + from zarr.core.buffer import default_buffer_prototype + from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer + from zarr.core.dtype import get_data_type_from_native_dtype + + store = MemoryStore() + zdtype = get_data_type_from_native_dtype(np.dtype("float64")) + spec = ArraySpec( + shape=(100,), + dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + + pipeline = PhasedCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = pipeline.evolve_from_array_spec(spec) + + data = np.arange(100, dtype="float64") + value = CPUNDBuffer.from_numpy_array(data) + chunk_sel = (slice(0, 100),) + store_path = StorePath(store, "c/0") + + # Write sync + pipeline.write_sync( + [(store_path, spec, chunk_sel, chunk_sel, True)], + value, + ) + + # Read async + out = CPUNDBuffer.from_numpy_array(np.zeros(100, dtype="float64")) + await pipeline.read( + [(store_path, spec, chunk_sel, chunk_sel, True)], + out, + ) + + np.testing.assert_array_equal(data, out.as_numpy_array()) diff --git a/tests/test_pipeline_benchmark.py b/tests/test_pipeline_benchmark.py new file mode 100644 index 0000000000..8eaeff7989 --- /dev/null +++ b/tests/test_pipeline_benchmark.py @@ -0,0 +1,163 @@ +"""Benchmark comparing BatchedCodecPipeline vs PhasedCodecPipeline. + +Run with: hatch run test.py3.12-minimal:pytest tests/test_pipeline_benchmark.py -v --benchmark-enable +""" + +from __future__ import annotations + +from enum import Enum +from typing import Any + +import numpy as np +import pytest + +from zarr.abc.codec import Codec +from zarr.codecs.bytes import BytesCodec +from zarr.codecs.gzip import GzipCodec +from zarr.codecs.sharding import ShardingCodec +from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.buffer import default_buffer_prototype +from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer +from zarr.core.codec_pipeline import BatchedCodecPipeline, PhasedCodecPipeline +from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.core.sync import sync +from zarr.storage import MemoryStore, StorePath + + +class PipelineKind(Enum): + batched = "batched" + phased_async = "phased_async" + phased_sync = "phased_sync" + phased_sync_threaded = "phased_sync_threaded" + + +# 1 MB of float64 = 131072 elements +CHUNK_ELEMENTS = 1024 * 1024 // 8 +CHUNK_SHAPE = (CHUNK_ELEMENTS,) + + +def _make_spec(shape: tuple[int, ...], dtype: str = "float64") -> ArraySpec: + zdtype = get_data_type_from_native_dtype(np.dtype(dtype)) + return ArraySpec( + shape=shape, + dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + + +def _build_codecs( + compressor: str, + serializer: str, +) -> tuple[Codec, ...]: + """Build a codec tuple from human-readable compressor/serializer names.""" + bb: tuple[Codec, ...] 
= () + if compressor == "gzip": + bb = (GzipCodec(level=1),) + + if serializer == "sharding": + # 4 inner chunks per shard + inner_chunk = (CHUNK_ELEMENTS // 4,) + inner_codecs: list[Codec] = [BytesCodec()] + if bb: + inner_codecs.extend(bb) + return (ShardingCodec(chunk_shape=inner_chunk, codecs=inner_codecs),) + else: + return (BytesCodec(), *bb) + + +def _make_pipeline( + kind: PipelineKind, + codecs: tuple[Codec, ...], + spec: ArraySpec, +) -> BatchedCodecPipeline | PhasedCodecPipeline: + if kind == PipelineKind.batched: + pipeline = BatchedCodecPipeline.from_codecs(codecs) + # Work around generator-consumption bug in codecs_from_list + evolved_codecs = tuple(c.evolve_from_array_spec(array_spec=spec) for c in pipeline) + return BatchedCodecPipeline.from_codecs(evolved_codecs) + else: # phased_async, phased_sync, phased_sync_threaded + pipeline = PhasedCodecPipeline.from_codecs(codecs) + return pipeline.evolve_from_array_spec(spec) + + +def _write_and_read( + pipeline: BatchedCodecPipeline | PhasedCodecPipeline, + store: MemoryStore, + spec: ArraySpec, + data: np.ndarray[Any, np.dtype[Any]], + kind: PipelineKind, + n_chunks: int = 1, +) -> None: + """Write data as n_chunks, then read it all back.""" + chunk_size = data.shape[0] // n_chunks + chunk_shape = (chunk_size,) + chunk_spec = _make_spec(chunk_shape, dtype=str(data.dtype)) + + # Build batch info for all chunks + write_batch: list[tuple[Any, ...]] = [] + for i in range(n_chunks): + store_path = StorePath(store, f"c/{i}") + chunk_sel = (slice(0, chunk_size),) + out_sel = (slice(i * chunk_size, (i + 1) * chunk_size),) + write_batch.append((store_path, chunk_spec, chunk_sel, out_sel, True)) + + value = CPUNDBuffer.from_numpy_array(data) + + if kind == PipelineKind.phased_sync: + assert isinstance(pipeline, PhasedCodecPipeline) + pipeline.write_sync(write_batch, value) + out = CPUNDBuffer.from_numpy_array(np.empty_like(data)) + pipeline.read_sync(write_batch, out) + elif kind == PipelineKind.phased_sync_threaded: + assert isinstance(pipeline, PhasedCodecPipeline) + pipeline.write_sync(write_batch, value, n_workers=4) + out = CPUNDBuffer.from_numpy_array(np.empty_like(data)) + pipeline.read_sync(write_batch, out, n_workers=4) + else: + sync(pipeline.write(write_batch, value)) + out = CPUNDBuffer.from_numpy_array(np.empty_like(data)) + sync(pipeline.read(write_batch, out)) + + +@pytest.mark.benchmark(group="pipeline") +@pytest.mark.parametrize( + "kind", + [ + PipelineKind.batched, + PipelineKind.phased_async, + PipelineKind.phased_sync, + PipelineKind.phased_sync_threaded, + ], + ids=["batched", "phased-async", "phased-sync", "phased-sync-threaded"], +) +@pytest.mark.parametrize("compressor", ["none", "gzip"], ids=["no-compress", "gzip"]) +@pytest.mark.parametrize("serializer", ["bytes", "sharding"], ids=["bytes", "sharding"]) +@pytest.mark.parametrize("n_chunks", [1, 8], ids=["1chunk", "8chunks"]) +def test_pipeline( + benchmark: Any, + kind: PipelineKind, + compressor: str, + serializer: str, + n_chunks: int, +) -> None: + """1 MB per chunk, parametrized over pipeline, compressor, serializer, and chunk count.""" + codecs = _build_codecs(compressor, serializer) + + # Sync paths require SupportsChunkPacking for the BytesCodec-level IO + # ShardingCodec now has _decode_sync/_encode_sync but not SupportsChunkPacking + if serializer == "sharding" and kind in (PipelineKind.phased_sync, PipelineKind.phased_sync_threaded): + pytest.skip("Sync IO path not yet implemented for ShardingCodec") + + # Threading only helps with multiple chunks + if 
kind == PipelineKind.phased_sync_threaded and n_chunks == 1: + pytest.skip("Threading with 1 chunk has no benefit") + + total_elements = CHUNK_ELEMENTS * n_chunks + spec = _make_spec((total_elements,)) + data = np.random.default_rng(42).random(total_elements) + store = MemoryStore() + pipeline = _make_pipeline(kind, codecs, _make_spec(CHUNK_SHAPE)) + + benchmark(_write_and_read, pipeline, store, spec, data, kind, n_chunks) From 3c27e4948c61358a17932f44db01712622f14f6b Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 8 Apr 2026 15:19:11 +0200 Subject: [PATCH 03/22] feat: complete second codecpipeline --- src/zarr/abc/codec.py | 37 +- src/zarr/codecs/bytes.py | 2 +- src/zarr/codecs/sharding.py | 7 +- src/zarr/core/array.py | 12 +- src/zarr/core/codec_pipeline.py | 764 +++++++++++++++++++++++----- tests/test_phased_codec_pipeline.py | 4 +- tests/test_pipeline_benchmark.py | 17 +- 7 files changed, 682 insertions(+), 161 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index b250b95521..d456210996 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -36,7 +36,7 @@ "GetResult", "PreparedWrite", "SupportsChunkCodec", - "SupportsChunkPacking", + "SupportsChunkMapping", "SupportsSyncCodec", ] @@ -100,21 +100,26 @@ def encode_chunk(self, chunk_array: NDBuffer) -> Buffer | None: ... @runtime_checkable -class SupportsChunkPacking(Protocol): - """Protocol for codecs that can pack/unpack inner chunks into a storage blob - and manage the prepare/finalize IO lifecycle. - - `BytesCodec` and `ShardingCodec` implement this protocol. The pipeline - uses it to separate IO (prepare/finalize) from compute (encode/decode), - enabling the compute phase to run in a thread pool. - - The lifecycle is: - - 1. **Prepare**: fetch existing bytes from the store (if partial write), - unpack into per-inner-chunk buffers → `PreparedWrite` - 2. **Compute**: iterate `PreparedWrite.indexer`, decode each inner chunk, - merge new data, re-encode, update `PreparedWrite.chunk_dict` - 3. **Finalize**: pack `chunk_dict` back into a blob and write to store +class SupportsChunkMapping(Protocol): + """Protocol for codecs that expose their stored data as a mapping + from chunk coordinates to encoded buffers. + + A single store key holds a blob. This protocol defines how to + interpret that blob as a ``dict[tuple[int, ...], Buffer | None]`` — + a mapping from inner-chunk coordinates to their encoded bytes. + + For a non-sharded codec (``BytesCodec``), the mapping is trivial: + one entry at ``(0,)`` containing the entire blob. For a sharded + codec, the mapping has one entry per inner chunk, derived from the + shard index embedded in the blob. The pipeline doesn't need to know + which case it's dealing with — it operates on the mapping uniformly. + + This abstraction enables the three-phase IO/compute/IO pattern: + + 1. **IO**: fetch the blob from the store. + 2. **Compute**: unpack the blob into the chunk mapping, decode/merge/ + re-encode entries, pack back into a blob. All pure compute. + 3. **IO**: write the blob to the store. 
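+
+    For illustration (hypothetical buffer objects), the same stored blob
+    viewed through this protocol:
+
+        # non-sharded (BytesCodec): one entry holding the whole blob
+        {(0,): blob}
+
+        # sharded with 2x2 inner chunks: one entry per inner chunk;
+        # None marks an inner chunk that is absent from the shard
+        {(0, 0): buf_a, (0, 1): buf_b, (1, 0): None, (1, 1): buf_c}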
""" @property diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index 1943bb0fe1..ac6dc3dd8e 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -127,7 +127,7 @@ async def _encode_single( def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length - # -- SupportsChunkPacking -- + # -- SupportsChunkMapping -- @property def inner_codec_chain(self) -> SupportsChunkCodec | None: diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 8b9c73be03..13dd668c17 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -35,6 +35,7 @@ numpy_buffer_prototype, ) from zarr.core.chunk_grids import ChunkGrid, RegularChunkGrid +from zarr.core.codec_pipeline import ChunkTransform from zarr.core.common import ( ShapeLike, parse_enum, @@ -423,10 +424,8 @@ def _get_inner_chunk_transform(self, shard_spec: ArraySpec) -> Any: evolved = tuple(c.evolve_from_array_spec(array_spec=chunk_spec) for c in self.codecs) return ChunkTransform(codecs=evolved, array_spec=chunk_spec) - def _get_index_chunk_transform(self, chunks_per_shard: tuple[int, ...]) -> Any: + def _get_index_chunk_transform(self, chunks_per_shard: tuple[int, ...]) -> ChunkTransform: """Build a ChunkTransform for index codecs.""" - from zarr.core.codec_pipeline import ChunkTransform - index_spec = self._get_index_chunk_spec(chunks_per_shard) evolved = tuple(c.evolve_from_array_spec(array_spec=index_spec) for c in self.index_codecs) return ChunkTransform(codecs=evolved, array_spec=index_spec) @@ -523,7 +522,7 @@ def _encode_sync( morton_order_iter(chunks_per_shard) ) - for chunk_coords, chunk_selection, out_selection, _ in indexer: + for chunk_coords, _chunk_selection, out_selection, _ in indexer: chunk_array = shard_array[out_selection] encoded = inner_transform.encode_chunk(chunk_array) shard_builder[chunk_coords] = encoded diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 7d1915fd33..2a7a513379 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -205,7 +205,17 @@ def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = None pass if isinstance(metadata, ArrayV3Metadata): - return get_pipeline_class().from_codecs(metadata.codecs) + pipeline = get_pipeline_class().from_codecs(metadata.codecs) + # PhasedCodecPipeline needs evolve_from_array_spec to build its + # ChunkTransform and ShardLayout. BatchedCodecPipeline does not. 
+ if hasattr(pipeline, "chunk_transform") and pipeline.chunk_transform is None: + chunk_spec = metadata.get_chunk_spec( + (0,) * len(metadata.shape), + ArrayConfig.from_dict({}), + default_buffer_prototype(), + ) + pipeline = pipeline.evolve_from_array_spec(chunk_spec) + return pipeline elif isinstance(metadata, ArrayV2Metadata): v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor) return get_pipeline_class().from_codecs([v2_codec]) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 33048d27fd..d2f646424f 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -6,6 +6,8 @@ from typing import TYPE_CHECKING, Any from warnings import warn +import numpy as np + from zarr.abc.codec import ( ArrayArrayCodec, ArrayBytesCodec, @@ -17,6 +19,8 @@ GetResult, SupportsSyncCodec, ) +from zarr.core.array_spec import ArraySpec +from zarr.core.buffer import numpy_buffer_prototype from zarr.core.common import concurrent_map from zarr.core.config import config from zarr.core.indexing import SelectorTuple, is_scalar @@ -28,7 +32,6 @@ from typing import Self from zarr.abc.store import ByteGetter, ByteSetter - from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer from zarr.core.chunk_grids import ChunkGrid from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType @@ -683,43 +686,321 @@ def codecs_from_list( @dataclass(frozen=True) -class PhasedCodecPipeline(CodecPipeline): - """Codec pipeline using the three-phase prepare/compute/finalize pattern. +class ShardLayout: + """Configuration extracted from a ShardingCodec that tells the pipeline + how to interpret a stored blob as a collection of inner chunks. + + This is a data structure, not an actor — the pipeline reads its fields + and handles all IO and compute itself. + """ + + inner_chunk_shape: tuple[int, ...] + chunks_per_shard: tuple[int, ...] + index_transform: ChunkTransform # for encoding/decoding the shard index + inner_transform: ChunkTransform # for encoding/decoding inner chunks + index_location: Any # ShardingCodecIndexLocation + index_size: int # byte size of the encoded shard index + + def decode_index(self, index_bytes: Buffer) -> Any: + """Decode a shard index from bytes. Pure compute.""" + from zarr.codecs.sharding import _ShardIndex + + index_array = self.index_transform.decode_chunk(index_bytes) + return _ShardIndex(index_array.as_numpy_array()) + + def encode_index(self, index: Any) -> Buffer: + """Encode a shard index to bytes. Pure compute.""" + from zarr.registry import get_ndbuffer_class + + index_nd = get_ndbuffer_class().from_numpy_array(index.offsets_and_lengths) + result = self.index_transform.encode_chunk(index_nd) + assert result is not None + return result + + async def fetch_index(self, byte_getter: Any) -> Any: + """Fetch and decode the shard index via byte-range read. 
IO + compute.""" + from zarr.abc.store import RangeByteRequest, SuffixByteRequest + from zarr.codecs.sharding import ShardingCodecIndexLocation + + if self.index_location == ShardingCodecIndexLocation.start: + index_bytes = await byte_getter.get( + prototype=numpy_buffer_prototype(), + byte_range=RangeByteRequest(0, self.index_size), + ) + else: + index_bytes = await byte_getter.get( + prototype=numpy_buffer_prototype(), + byte_range=SuffixByteRequest(self.index_size), + ) + if index_bytes is None: + return None + return self.decode_index(index_bytes) + + def fetch_index_sync(self, byte_getter: Any) -> Any: + """Sync variant of fetch_index.""" + from zarr.abc.store import RangeByteRequest, SuffixByteRequest + from zarr.codecs.sharding import ShardingCodecIndexLocation + + if self.index_location == ShardingCodecIndexLocation.start: + index_bytes = byte_getter.get_sync( + prototype=numpy_buffer_prototype(), + byte_range=RangeByteRequest(0, self.index_size), + ) + else: + index_bytes = byte_getter.get_sync( + prototype=numpy_buffer_prototype(), + byte_range=SuffixByteRequest(self.index_size), + ) + if index_bytes is None: + return None + return self.decode_index(index_bytes) + + async def fetch_chunks( + self, byte_getter: Any, index: Any, needed_coords: set[tuple[int, ...]] + ) -> dict[tuple[int, ...], Buffer | None]: + """Fetch only the needed inner chunks via byte-range reads, concurrently.""" + from zarr.abc.store import RangeByteRequest + from zarr.core.buffer import default_buffer_prototype + + coords_list = list(needed_coords) + slices = [index.get_chunk_slice(c) for c in coords_list] + + async def _fetch_one( + coords: tuple[int, ...], chunk_slice: tuple[int, int] | None + ) -> tuple[tuple[int, ...], Buffer | None]: + if chunk_slice is not None: + chunk_bytes = await byte_getter.get( + prototype=default_buffer_prototype(), + byte_range=RangeByteRequest(chunk_slice[0], chunk_slice[1]), + ) + return (coords, chunk_bytes) + return (coords, None) + + fetched = await concurrent_map( + list(zip(coords_list, slices, strict=True)), + _fetch_one, + config.get("async.concurrency"), + ) + return dict(fetched) + + def fetch_chunks_sync( + self, byte_getter: Any, index: Any, needed_coords: set[tuple[int, ...]] + ) -> dict[tuple[int, ...], Buffer | None]: + """Sync variant of fetch_chunks.""" + from zarr.abc.store import RangeByteRequest + from zarr.core.buffer import default_buffer_prototype + + result: dict[tuple[int, ...], Buffer | None] = {} + for coords in needed_coords: + chunk_slice = index.get_chunk_slice(coords) + if chunk_slice is not None: + chunk_bytes = byte_getter.get_sync( + prototype=default_buffer_prototype(), + byte_range=RangeByteRequest(chunk_slice[0], chunk_slice[1]), + ) + result[coords] = chunk_bytes + else: + result[coords] = None + return result + + def unpack_blob(self, blob: Buffer) -> dict[tuple[int, ...], Buffer | None]: + """Unpack a shard blob into per-inner-chunk buffers. 
Pure compute.""" + from zarr.codecs.sharding import ShardingCodecIndexLocation + + if self.index_location == ShardingCodecIndexLocation.start: + index_bytes = blob[: self.index_size] + else: + index_bytes = blob[-self.index_size :] + + index = self.decode_index(index_bytes) + result: dict[tuple[int, ...], Buffer | None] = {} + for chunk_coords in np.ndindex(self.chunks_per_shard): + chunk_slice = index.get_chunk_slice(chunk_coords) + if chunk_slice is not None: + result[chunk_coords] = blob[chunk_slice[0] : chunk_slice[1]] + else: + result[chunk_coords] = None + return result - Separates IO (prepare, finalize) from compute (encode, decode) so that - the compute phase can run without holding IO resources. This is the - foundation for thread-pool-based parallelism. + def pack_blob( + self, chunk_dict: dict[tuple[int, ...], Buffer | None], prototype: BufferPrototype + ) -> Buffer | None: + """Pack per-inner-chunk buffers into a shard blob. Pure compute.""" + from zarr.codecs.sharding import MAX_UINT_64, ShardingCodecIndexLocation, _ShardIndex + from zarr.core.indexing import morton_order_iter + + index = _ShardIndex.create_empty(self.chunks_per_shard) + buffers: list[Buffer] = [] + template = prototype.buffer.create_zero_length() + chunk_start = 0 + + for chunk_coords in morton_order_iter(self.chunks_per_shard): + value = chunk_dict.get(chunk_coords) + if value is None or len(value) == 0: + continue + chunk_length = len(value) + buffers.append(value) + index.set_chunk_slice(chunk_coords, slice(chunk_start, chunk_start + chunk_length)) + chunk_start += chunk_length + + if not buffers: + return None + + index_bytes = self.encode_index(index) + if self.index_location == ShardingCodecIndexLocation.start: + empty_mask = index.offsets_and_lengths[..., 0] == MAX_UINT_64 + index.offsets_and_lengths[~empty_mask, 0] += len(index_bytes) + index_bytes = self.encode_index(index) + buffers.insert(0, index_bytes) + else: + buffers.append(index_bytes) - Works with any ``ArrayBytesCodec``. The sync path (``read_sync`` / - ``write_sync``) requires ``SupportsChunkPacking`` and ``SupportsSyncCodec``. 
+ return template.combine(buffers) + + @classmethod + def from_sharding_codec(cls, codec: Any, shard_spec: ArraySpec) -> ShardLayout: + """Extract layout configuration from a ShardingCodec.""" + chunk_shape = codec.chunk_shape + shard_shape = shard_spec.shape + chunks_per_shard = tuple(s // c for s, c in zip(shard_shape, chunk_shape, strict=True)) + + # Build inner chunk spec + inner_spec = ArraySpec( + shape=chunk_shape, + dtype=shard_spec.dtype, + fill_value=shard_spec.fill_value, + config=shard_spec.config, + prototype=shard_spec.prototype, + ) + inner_evolved = tuple(c.evolve_from_array_spec(array_spec=inner_spec) for c in codec.codecs) + inner_transform = ChunkTransform(codecs=inner_evolved, array_spec=inner_spec) + + # Build index spec and transform + from zarr.codecs.sharding import MAX_UINT_64 + from zarr.core.array_spec import ArrayConfig + from zarr.core.buffer import default_buffer_prototype + from zarr.core.dtype.npy.int import UInt64 + + index_spec = ArraySpec( + shape=chunks_per_shard + (2,), + dtype=UInt64(endianness="little"), + fill_value=MAX_UINT_64, + config=ArrayConfig(order="C", write_empty_chunks=False), + prototype=default_buffer_prototype(), + ) + index_evolved = tuple( + c.evolve_from_array_spec(array_spec=index_spec) for c in codec.index_codecs + ) + index_transform = ChunkTransform(codecs=index_evolved, array_spec=index_spec) + + # Compute index size + index_size = index_transform.compute_encoded_size( + 16 * int(np.prod(chunks_per_shard)), index_spec + ) + + return cls( + inner_chunk_shape=chunk_shape, + chunks_per_shard=chunks_per_shard, + index_transform=index_transform, + inner_transform=inner_transform, + index_location=codec.index_location, + index_size=index_size, + ) + + +@dataclass(frozen=True) +class PhasedCodecPipeline(CodecPipeline): + """Codec pipeline that cleanly separates IO from compute. + + The zarr v3 spec describes each codec as a function that may perform + IO — the sharding codec, for example, is specified as reading and + writing inner chunks from storage. This framing suggests that IO is + distributed throughout the codec chain, making it difficult to + parallelize or optimize. + + In practice, **codecs are pure compute**. Every codec transforms + bytes to bytes, bytes to arrays, or arrays to arrays — none of them + need to touch storage. The only IO happens at the pipeline level: + reading a blob from a store key, and writing a blob back. Even the + sharding codec is just a transform: it takes the full shard blob + (already fetched) and splits it into inner-chunk buffers using an + index, then decodes each inner chunk through its inner codec chain. + No additional IO occurs inside the codec. + + This insight enables a strict three-phase architecture: + + 1. **IO phase** — fetch raw bytes from the store (one key per chunk + or shard). This is the only phase that touches storage. + 2. **Compute phase** — decode, merge, and re-encode chunks through + the full codec chain, including sharding. This is pure CPU work + with no IO, and can safely run in a thread pool. + 3. **IO phase** — write results back to the store. + + Because the compute phase is IO-free, it can be parallelized with + threads (sync path) or ``asyncio.to_thread`` (async path) without + holding IO resources or risking deadlocks. 
+ + Nested sharding (a shard whose inner chunks are themselves shards) + works the same way: the outer shard blob is fetched once in phase 1, + then the compute phase unpacks it into inner shard blobs, each of + which is decoded by the inner sharding codec — still pure compute, + still no IO. The entire decode tree runs from the single blob + fetched in phase 1. """ codecs: tuple[Codec, ...] + array_array_codecs: tuple[ArrayArrayCodec, ...] + array_bytes_codec: ArrayBytesCodec + bytes_bytes_codecs: tuple[BytesBytesCodec, ...] chunk_transform: ChunkTransform | None + shard_layout: ShardLayout | None batch_size: int @classmethod def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self: + """Create a pipeline from codecs. + + The pipeline is not usable for read/write until ``evolve_from_array_spec`` + is called with the chunk's ArraySpec. This matches the CodecPipeline ABC + contract. + """ codec_list = tuple(codecs) - codecs_from_list(codec_list) # validate codec ordering + aa, ab, bb = codecs_from_list(codec_list) if batch_size is None: batch_size = config.get("codec_pipeline.batch_size") + # chunk_transform and shard_layout require an ArraySpec. + # They'll be built in evolve_from_array_spec. return cls( codecs=codec_list, + array_array_codecs=aa, + array_bytes_codec=ab, + bytes_bytes_codecs=bb, chunk_transform=None, + shard_layout=None, batch_size=batch_size, ) def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: + from zarr.codecs.sharding import ShardingCodec + evolved_codecs = tuple(c.evolve_from_array_spec(array_spec=array_spec) for c in self.codecs) - # Only create ChunkTransform if all codecs support sync - all_sync = all(isinstance(c, SupportsSyncCodec) for c in evolved_codecs) - chunk_transform = ChunkTransform(codecs=evolved_codecs, array_spec=array_spec) if all_sync else None + aa, ab, bb = codecs_from_list(evolved_codecs) + + chunk_transform = ChunkTransform(codecs=evolved_codecs, array_spec=array_spec) + + shard_layout: ShardLayout | None = None + if isinstance(ab, ShardingCodec): + shard_layout = ShardLayout.from_sharding_codec(ab, array_spec) + return type(self)( codecs=evolved_codecs, + array_array_codecs=aa, + array_bytes_codec=ab, + bytes_bytes_codecs=bb, chunk_transform=chunk_transform, + shard_layout=shard_layout, batch_size=self.batch_size, ) @@ -728,42 +1009,50 @@ def __iter__(self) -> Iterator[Codec]: @property def supports_partial_decode(self) -> bool: - ab = self._ab_codec - return isinstance(ab, ArrayBytesCodecPartialDecodeMixin) + return isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin) @property def supports_partial_encode(self) -> bool: - ab = self._ab_codec - return isinstance(ab, ArrayBytesCodecPartialEncodeMixin) + return isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin) def validate( - self, *, shape: tuple[int, ...], dtype: ZDType[TBaseDType, TBaseScalar], chunk_grid: ChunkGrid + self, + *, + shape: tuple[int, ...], + dtype: ZDType[TBaseDType, TBaseScalar], + chunk_grid: ChunkGrid, ) -> None: for codec in self.codecs: codec.validate(shape=shape, dtype=dtype, chunk_grid=chunk_grid) def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int: - if self.chunk_transform is not None: - return self.chunk_transform.compute_encoded_size(byte_length, array_spec) - return byte_length + if self.chunk_transform is None: + raise RuntimeError( + "Cannot compute encoded size before evolve_from_array_spec is called." 
+ ) + return self.chunk_transform.compute_encoded_size(byte_length, array_spec) async def decode( self, chunk_bytes_and_specs: Iterable[tuple[Buffer | None, ArraySpec]], ) -> Iterable[NDBuffer | None]: - """Decode a batch of chunks through the full codec chain.""" - aa, ab, bb = codecs_from_list(self.codecs) + """Decode a batch of chunks through the full codec chain. + + Required by the ``CodecPipeline`` ABC. Not used internally by + this pipeline — reads go through ``_transform_read`` or + ``_read_shard_selective`` instead. + """ chunk_bytes_batch: Iterable[Buffer | None] chunk_bytes_batch, chunk_specs = _unzip2(chunk_bytes_and_specs) - for bb_codec in bb[::-1]: + for bb_codec in self.bytes_bytes_codecs[::-1]: chunk_bytes_batch = await bb_codec.decode( zip(chunk_bytes_batch, chunk_specs, strict=False) ) - chunk_array_batch = await ab.decode( + chunk_array_batch = await self.array_bytes_codec.decode( zip(chunk_bytes_batch, chunk_specs, strict=False) ) - for aa_codec in aa[::-1]: + for aa_codec in self.array_array_codecs[::-1]: chunk_array_batch = await aa_codec.decode( zip(chunk_array_batch, chunk_specs, strict=False) ) @@ -773,50 +1062,84 @@ async def encode( self, chunk_arrays_and_specs: Iterable[tuple[NDBuffer | None, ArraySpec]], ) -> Iterable[Buffer | None]: - """Encode a batch of chunks through the full codec chain.""" - aa, ab, bb = codecs_from_list(self.codecs) + """Encode a batch of chunks through the full codec chain. + + Required by the ``CodecPipeline`` ABC. Not used internally by + this pipeline — writes go through ``_transform_write`` instead. + """ chunk_array_batch: Iterable[NDBuffer | None] chunk_array_batch, chunk_specs = _unzip2(chunk_arrays_and_specs) - for aa_codec in aa: + for aa_codec in self.array_array_codecs: chunk_array_batch = await aa_codec.encode( zip(chunk_array_batch, chunk_specs, strict=False) ) - chunk_bytes_batch = await ab.encode( + chunk_bytes_batch = await self.array_bytes_codec.encode( zip(chunk_array_batch, chunk_specs, strict=False) ) - for bb_codec in bb: + for bb_codec in self.bytes_bytes_codecs: chunk_bytes_batch = await bb_codec.encode( zip(chunk_bytes_batch, chunk_specs, strict=False) ) return chunk_bytes_batch - @property - def _ab_codec(self) -> ArrayBytesCodec: - _, ab, _ = codecs_from_list(self.codecs) - return ab - # -- Phase 2: pure compute (no IO) -- def _transform_read( self, raw: Buffer | None, - _chunk_spec: ArraySpec, + chunk_spec: ArraySpec, ) -> NDBuffer | None: """Decode raw bytes into an array. Pure sync compute, no IO. - Requires ``chunk_transform`` (all codecs must support sync). - Raises ``RuntimeError`` if called without a chunk transform. + For non-sharded arrays, decodes through the full codec chain. + For sharded arrays, unpacks the shard blob using the layout, + decodes each inner chunk through the inner transform, and + assembles the shard-shaped output. """ if raw is None: return None - if self.chunk_transform is None: - raise RuntimeError( - "Cannot call _transform_read without a ChunkTransform. " - "All codecs must implement SupportsSyncCodec for sync compute." - ) + + if self.shard_layout is not None: + return self._decode_shard(raw, chunk_spec, self.shard_layout) + + assert self.chunk_transform is not None return self.chunk_transform.decode_chunk(raw) + def _decode_shard(self, blob: Buffer, shard_spec: ArraySpec, layout: ShardLayout) -> NDBuffer: + """Decode a full shard blob into a shard-shaped array. Pure compute. + + Used by the write path (via ``_transform_read``) to decode existing + shard data before merging. 
For reads, ``_read_shard_selective`` is + preferred since it fetches only the needed inner chunks. + """ + from zarr.core.chunk_grids import RegularChunkGrid + from zarr.core.indexing import BasicIndexer + + chunk_dict = layout.unpack_blob(blob) + + out = shard_spec.prototype.nd_buffer.empty( + shape=shard_spec.shape, + dtype=shard_spec.dtype.to_native_dtype(), + order=shard_spec.order, + ) + + indexer = BasicIndexer( + tuple(slice(0, s) for s in shard_spec.shape), + shape=shard_spec.shape, + chunk_grid=RegularChunkGrid(chunk_shape=layout.inner_chunk_shape), + ) + + for chunk_coords, chunk_selection, out_selection, _ in indexer: + chunk_bytes = chunk_dict.get(chunk_coords) + if chunk_bytes is not None: + chunk_array = layout.inner_transform.decode_chunk(chunk_bytes) + out[out_selection] = chunk_array[chunk_selection] + else: + out[out_selection] = shard_spec.fill_value + + return out + def _transform_write( self, existing: Buffer | None, @@ -826,17 +1149,20 @@ def _transform_write( value: NDBuffer, drop_axes: tuple[int, ...], ) -> Buffer | None: - """Decode existing, merge new data, re-encode. Pure sync compute, no IO. - - Requires ``chunk_transform`` (all codecs must support sync). - Raises ``RuntimeError`` if called without a chunk transform. - """ - if self.chunk_transform is None: - raise RuntimeError( - "Cannot call _transform_write without a ChunkTransform. " - "All codecs must implement SupportsSyncCodec for sync compute." + """Decode existing, merge new data, re-encode. Pure sync compute, no IO.""" + if self.shard_layout is not None: + return self._transform_write_shard( + existing, + chunk_spec, + chunk_selection, + out_selection, + value, + drop_axes, + self.shard_layout, ) + assert self.chunk_transform is not None + if existing is not None: chunk_array: NDBuffer | None = self.chunk_transform.decode_chunk(existing) else: @@ -849,15 +1175,97 @@ def _transform_write( fill_value=fill_value_or_default(chunk_spec), ) - # Merge new data - if drop_axes: - chunk_value = value[out_selection] - chunk_array[chunk_selection] = chunk_value.squeeze(axis=drop_axes) + if chunk_selection == () or is_scalar( + value.as_ndarray_like(), chunk_spec.dtype.to_native_dtype() + ): + chunk_value = value else: - chunk_array[chunk_selection] = value[out_selection] + chunk_value = value[out_selection] + if drop_axes: + item = tuple( + None if idx in drop_axes else slice(None) for idx in range(chunk_spec.ndim) + ) + chunk_value = chunk_value[item] + chunk_array[chunk_selection] = chunk_value return self.chunk_transform.encode_chunk(chunk_array) + def _transform_write_shard( + self, + existing: Buffer | None, + shard_spec: ArraySpec, + chunk_selection: SelectorTuple, + out_selection: SelectorTuple, + value: NDBuffer, + drop_axes: tuple[int, ...], + layout: ShardLayout, + ) -> Buffer | None: + """Write into a shard, only decoding/encoding the affected inner chunks. + + Operates at the chunk mapping level: the existing shard blob is + unpacked into a mapping of inner-chunk coordinates to raw bytes. + Only inner chunks touched by the selection are decoded, merged, + and re-encoded. Untouched chunks pass through as raw bytes. 
+ """ + from zarr.core.buffer import default_buffer_prototype + from zarr.core.chunk_grids import RegularChunkGrid + from zarr.core.indexing import get_indexer + + # Unpack existing shard into chunk mapping (no decode — just index parse + byte slicing) + if existing is not None: + chunk_dict = layout.unpack_blob(existing) + else: + chunk_dict = dict.fromkeys(np.ndindex(layout.chunks_per_shard)) + + # Determine which inner chunks are affected by the write selection + indexer = get_indexer( + chunk_selection, + shape=shard_spec.shape, + chunk_grid=RegularChunkGrid(chunk_shape=layout.inner_chunk_shape), + ) + + inner_spec = ArraySpec( + shape=layout.inner_chunk_shape, + dtype=shard_spec.dtype, + fill_value=shard_spec.fill_value, + config=shard_spec.config, + prototype=shard_spec.prototype, + ) + + # Only decode, merge, re-encode the affected inner chunks + for inner_coords, inner_sel, value_sel, _ in indexer: + existing_bytes = chunk_dict.get(inner_coords) + + # Decode just this inner chunk + if existing_bytes is not None: + inner_array = layout.inner_transform.decode_chunk(existing_bytes) + else: + inner_array = inner_spec.prototype.nd_buffer.create( + shape=inner_spec.shape, + dtype=inner_spec.dtype.to_native_dtype(), + fill_value=fill_value_or_default(inner_spec), + ) + + # Merge new data into this inner chunk + if inner_sel == () or is_scalar( + value.as_ndarray_like(), inner_spec.dtype.to_native_dtype() + ): + inner_value = value + else: + inner_value = value[value_sel] + if drop_axes: + item = tuple( + None if idx in drop_axes else slice(None) for idx in range(inner_spec.ndim) + ) + inner_value = inner_value[item] + inner_array[inner_sel] = inner_value + + # Re-encode just this inner chunk + chunk_dict[inner_coords] = layout.inner_transform.encode_chunk(inner_array) + + # Pack the mapping back into a blob (untouched chunks pass through as raw bytes) + return layout.pack_blob(chunk_dict, default_buffer_prototype()) + # -- Phase 3: scatter (read) / store (write) -- @staticmethod @@ -885,6 +1293,58 @@ def _scatter( # -- Async API -- + async def _read_shard_selective( + self, + byte_getter: Any, + shard_spec: ArraySpec, + chunk_selection: SelectorTuple, + layout: ShardLayout, + ) -> NDBuffer | None: + """Read from a shard fetching only the needed inner chunks. + + 1. Fetch shard index (byte-range read) + 2. Determine which inner chunks are needed + 3. Fetch only those inner chunks (byte-range reads) + 4. 
Decode and assemble (pure compute) + """ + from zarr.core.chunk_grids import RegularChunkGrid + from zarr.core.indexing import get_indexer + + # Phase 1: fetch index + index = await layout.fetch_index(byte_getter) + if index is None: + return None + + # Determine needed inner chunks + indexer = list( + get_indexer( + chunk_selection, + shape=shard_spec.shape, + chunk_grid=RegularChunkGrid(chunk_shape=layout.inner_chunk_shape), + ) + ) + needed_coords = {coords for coords, *_ in indexer} + + # Phase 2: fetch only needed inner chunks + chunk_dict = await layout.fetch_chunks(byte_getter, index, needed_coords) + + # Phase 3: decode and assemble + out = shard_spec.prototype.nd_buffer.empty( + shape=shard_spec.shape, + dtype=shard_spec.dtype.to_native_dtype(), + order=shard_spec.order, + ) + + for inner_coords, inner_sel, out_sel, _ in indexer: + chunk_bytes = chunk_dict.get(inner_coords) + if chunk_bytes is not None: + inner_array = layout.inner_transform.decode_chunk(chunk_bytes) + out[out_sel] = inner_array[inner_sel] + else: + out[out_sel] = shard_spec.fill_value + + return out + async def read( self, batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], @@ -895,29 +1355,39 @@ async def read( if not batch: return () - # Phase 1: IO — fetch all raw bytes concurrently - raw_buffers: list[Buffer | None] = await concurrent_map( - [(bg, cs.prototype) for bg, cs, *_ in batch], - lambda bg, proto: bg.get(prototype=proto), - config.get("async.concurrency"), - ) - - # Phase 2: compute — decode all chunks - if self.chunk_transform is not None: - # All codecs support sync — offload to threads for parallelism + if self.shard_layout is not None: + # Sharded: use selective byte-range reads per shard + decoded: list[NDBuffer | None] = list( + await concurrent_map( + [(bg, cs, chunk_sel, self.shard_layout) for bg, cs, chunk_sel, _, _ in batch], + self._read_shard_selective, + config.get("async.concurrency"), + ) + ) + elif len(batch) == 1: + # Non-sharded single chunk: fetch and decode inline + bg, cs, _, _, _ = batch[0] + raw = await bg.get(prototype=cs.prototype) + decoded = [self._transform_read(raw, cs)] + else: + # Non-sharded multiple chunks: fetch all, decode in parallel threads import asyncio - decoded: list[NDBuffer | None] = list(await asyncio.gather(*[ - asyncio.to_thread(self._transform_read, raw, cs) - for raw, (_, cs, *_) in zip(raw_buffers, batch, strict=True) - ])) - else: - # Some codecs are async-only — decode inline (no threading, no deadlock) - decoded = list(await self.decode( - zip(raw_buffers, [cs for _, cs, *_ in batch], strict=False) - )) + raw_buffers: list[Buffer | None] = await concurrent_map( + [(bg, cs.prototype) for bg, cs, *_ in batch], + lambda bg, proto: bg.get(prototype=proto), + config.get("async.concurrency"), + ) + decoded = list( + await asyncio.gather( + *[ + asyncio.to_thread(self._transform_read, raw, cs) + for raw, (_, cs, *_) in zip(raw_buffers, batch, strict=True) + ] + ) + ) - # Phase 3: scatter + # Scatter return self._scatter(batch, decoded, out, drop_axes) async def write( @@ -945,45 +1415,26 @@ async def _fetch_existing( ) # Phase 2: compute — decode, merge, re-encode - if self.chunk_transform is not None: - # All codecs support sync — offload to threads for parallelism + if len(batch) == 1: + _, cs, csel, osel, _ = batch[0] + blobs: list[Buffer | None] = [ + self._transform_write(existing_buffers[0], cs, csel, osel, value, drop_axes) + ] + else: import asyncio - blobs: list[Buffer | None] = list(await asyncio.gather(*[ - 
asyncio.to_thread( - self._transform_write, existing, cs, csel, osel, value, drop_axes + blobs = list( + await asyncio.gather( + *[ + asyncio.to_thread( + self._transform_write, existing, cs, csel, osel, value, drop_axes + ) + for existing, (_, cs, csel, osel, _) in zip( + existing_buffers, batch, strict=True + ) + ] ) - for existing, (_, cs, csel, osel, _) in zip( - existing_buffers, batch, strict=True - ) - ])) - else: - # Some codecs are async-only — encode inline (no threading, no deadlock) - blobs = [] - for existing, (_, cs, csel, osel, _) in zip( - existing_buffers, batch, strict=True - ): - if existing is not None: - chunk_array_batch = await self.decode([(existing, cs)]) - chunk_array = next(iter(chunk_array_batch)) - else: - chunk_array = None - - if chunk_array is None: - chunk_array = cs.prototype.nd_buffer.create( - shape=cs.shape, - dtype=cs.dtype.to_native_dtype(), - fill_value=fill_value_or_default(cs), - ) - - if drop_axes: - chunk_value = value[osel] - chunk_array[csel] = chunk_value.squeeze(axis=drop_axes) - else: - chunk_array[csel] = value[osel] - - encoded_batch = await self.encode([(chunk_array, cs)]) - blobs.append(next(iter(encoded_batch))) + ) # Phase 3: IO — write results concurrently async def _store_one(byte_setter: ByteSetter, blob: Buffer | None) -> None: @@ -1000,6 +1451,48 @@ async def _store_one(byte_setter: ByteSetter, blob: Buffer | None) -> None: # -- Sync API -- + def _read_shard_selective_sync( + self, + byte_getter: Any, + shard_spec: ArraySpec, + chunk_selection: SelectorTuple, + layout: ShardLayout, + ) -> NDBuffer | None: + """Sync variant of _read_shard_selective.""" + from zarr.core.chunk_grids import RegularChunkGrid + from zarr.core.indexing import get_indexer + + index = layout.fetch_index_sync(byte_getter) + if index is None: + return None + + indexer = list( + get_indexer( + chunk_selection, + shape=shard_spec.shape, + chunk_grid=RegularChunkGrid(chunk_shape=layout.inner_chunk_shape), + ) + ) + needed_coords = {coords for coords, *_ in indexer} + + chunk_dict = layout.fetch_chunks_sync(byte_getter, index, needed_coords) + + out = shard_spec.prototype.nd_buffer.empty( + shape=shard_spec.shape, + dtype=shard_spec.dtype.to_native_dtype(), + order=shard_spec.order, + ) + + for inner_coords, inner_sel, out_sel, _ in indexer: + chunk_bytes = chunk_dict.get(inner_coords) + if chunk_bytes is not None: + inner_array = layout.inner_transform.decode_chunk(chunk_bytes) + out[out_sel] = inner_array[inner_sel] + else: + out[out_sel] = shard_spec.fill_value + + return out + def read_sync( self, batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], @@ -1007,28 +1500,34 @@ def read_sync( drop_axes: tuple[int, ...] = (), n_workers: int = 0, ) -> None: - """Synchronous read. 
Same three phases as async, different IO wrapper.""" + """Synchronous read.""" batch = list(batch_info) if not batch: return - # Phase 1: IO — fetch all raw bytes serially - raw_buffers: list[Buffer | None] = [ - bg.get_sync(prototype=cs.prototype) for bg, cs, *_ in batch - ] - - # Phase 2: compute — decode (optionally threaded) - specs = [cs for _, cs, *_ in batch] - if n_workers > 0 and len(batch) > 1: - with ThreadPoolExecutor(max_workers=n_workers) as pool: - decoded = list(pool.map(self._transform_read, raw_buffers, specs)) + if self.shard_layout is not None: + # Sharded: selective byte-range reads per shard + decoded: list[NDBuffer | None] = [ + self._read_shard_selective_sync(bg, cs, chunk_sel, self.shard_layout) + for bg, cs, chunk_sel, _, _ in batch + ] else: - decoded = [ - self._transform_read(raw, cs) - for raw, cs in zip(raw_buffers, specs, strict=True) + # Non-sharded: fetch full blobs, decode (optionally threaded) + raw_buffers: list[Buffer | None] = [ + bg.get_sync(prototype=cs.prototype) # type: ignore[attr-defined] + for bg, cs, *_ in batch ] + specs = [cs for _, cs, *_ in batch] + if n_workers > 0 and len(batch) > 1: + with ThreadPoolExecutor(max_workers=n_workers) as pool: + decoded = list(pool.map(self._transform_read, raw_buffers, specs)) + else: + decoded = [ + self._transform_read(raw, cs) + for raw, cs in zip(raw_buffers, specs, strict=True) + ] - # Phase 3: scatter + # Scatter self._scatter(batch, decoded, out, drop_axes) def write_sync( @@ -1045,7 +1544,7 @@ def write_sync( # Phase 1: IO — fetch existing bytes serially existing_buffers: list[Buffer | None] = [ - None if ic else bs.get_sync(prototype=cs.prototype) + None if ic else bs.get_sync(prototype=cs.prototype) # type: ignore[attr-defined] for bs, cs, _, _, ic in batch ] @@ -1064,6 +1563,9 @@ def _compute(idx: int) -> Buffer | None: # Phase 3: IO — write results serially for (bs, *_), blob in zip(batch, blobs, strict=True): if blob is None: - bs.delete_sync() + bs.delete_sync() # type: ignore[attr-defined] else: - bs.set_sync(blob) + bs.set_sync(blob) # type: ignore[attr-defined] + + +register_pipeline(PhasedCodecPipeline) diff --git a/tests/test_phased_codec_pipeline.py b/tests/test_phased_codec_pipeline.py index 2b81787858..902cc2ff20 100644 --- a/tests/test_phased_codec_pipeline.py +++ b/tests/test_phased_codec_pipeline.py @@ -22,12 +22,12 @@ def _create_array( chunks: tuple[int, ...] | None = None, codecs: tuple[Any, ...] 
= (BytesCodec(),), fill_value: object = 0, -) -> zarr.Array: +) -> zarr.Array[Any]: """Create a zarr array using PhasedCodecPipeline.""" if chunks is None: chunks = shape - pipeline = PhasedCodecPipeline.from_codecs(codecs) + _ = PhasedCodecPipeline.from_codecs(codecs) return zarr.create_array( StorePath(MemoryStore()), diff --git a/tests/test_pipeline_benchmark.py b/tests/test_pipeline_benchmark.py index 8eaeff7989..5d05190a95 100644 --- a/tests/test_pipeline_benchmark.py +++ b/tests/test_pipeline_benchmark.py @@ -6,12 +6,11 @@ from __future__ import annotations from enum import Enum -from typing import Any +from typing import TYPE_CHECKING, Any import numpy as np import pytest -from zarr.abc.codec import Codec from zarr.codecs.bytes import BytesCodec from zarr.codecs.gzip import GzipCodec from zarr.codecs.sharding import ShardingCodec @@ -23,6 +22,9 @@ from zarr.core.sync import sync from zarr.storage import MemoryStore, StorePath +if TYPE_CHECKING: + from zarr.abc.codec import Codec + class PipelineKind(Enum): batched = "batched" @@ -78,7 +80,7 @@ def _make_pipeline( evolved_codecs = tuple(c.evolve_from_array_spec(array_spec=spec) for c in pipeline) return BatchedCodecPipeline.from_codecs(evolved_codecs) else: # phased_async, phased_sync, phased_sync_threaded - pipeline = PhasedCodecPipeline.from_codecs(codecs) + pipeline = PhasedCodecPipeline.from_codecs(codecs) # type: ignore[assignment] return pipeline.evolve_from_array_spec(spec) @@ -145,9 +147,12 @@ def test_pipeline( """1 MB per chunk, parametrized over pipeline, compressor, serializer, and chunk count.""" codecs = _build_codecs(compressor, serializer) - # Sync paths require SupportsChunkPacking for the BytesCodec-level IO - # ShardingCodec now has _decode_sync/_encode_sync but not SupportsChunkPacking - if serializer == "sharding" and kind in (PipelineKind.phased_sync, PipelineKind.phased_sync_threaded): + # Sync paths require SupportsChunkMapping for the BytesCodec-level IO + # ShardingCodec now has _decode_sync/_encode_sync but not SupportsChunkMapping + if serializer == "sharding" and kind in ( + PipelineKind.phased_sync, + PipelineKind.phased_sync_threaded, + ): pytest.skip("Sync IO path not yet implemented for ShardingCodec") # Threading only helps with multiple chunks From c731cf2de044c2684cdd50a9d4b1ec1ee4c9b050 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Wed, 8 Apr 2026 19:51:57 +0200 Subject: [PATCH 04/22] fix: handle rectilinear chunks --- src/zarr/core/array.py | 7 +++- src/zarr/core/codec_pipeline.py | 61 ++++++++++++++++++++++++++------- 2 files changed, 54 insertions(+), 14 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 676f133900..d52d27afd6 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -234,10 +234,15 @@ def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = None if hasattr(pipeline, "chunk_transform") and pipeline.chunk_transform is None: from zarr.core.metadata.v3 import RegularChunkGridMetadata + # Use the regular chunk shape if available, otherwise use a + # placeholder shape. The ChunkTransform is shape-agnostic — + # the actual chunk shape is passed per-call at decode/encode time. if isinstance(metadata.chunk_grid, RegularChunkGridMetadata): chunk_shape = metadata.chunk_grid.chunk_shape else: - chunk_shape = metadata.shape # fallback for rectilinear + # Rectilinear: use a 1-element shape per dimension as placeholder. + # Only dtype/fill_value/config matter for codec evolution. 
+ chunk_shape = (1,) * len(metadata.shape) chunk_spec = ArraySpec( shape=chunk_shape, dtype=metadata.data_type, diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 3a459d79f7..57266df75c 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -1,7 +1,7 @@ from __future__ import annotations from concurrent.futures import ThreadPoolExecutor -from dataclasses import dataclass, field +from dataclasses import dataclass, field, replace from itertools import islice, pairwise from typing import TYPE_CHECKING, Any from warnings import warn @@ -122,47 +122,78 @@ def __post_init__(self) -> None: bb_sync.append(bb_codec) self._bb_codecs = tuple(bb_sync) + def _spec_for_shape(self, shape: tuple[int, ...]) -> ArraySpec: + """Build an ArraySpec with the given shape, inheriting dtype/fill/config/prototype.""" + if shape == self._ab_spec.shape: + return self._ab_spec + return replace(self._ab_spec, shape=shape) + def decode_chunk( self, chunk_bytes: Buffer, + chunk_shape: tuple[int, ...] | None = None, ) -> NDBuffer: """Decode a single chunk through the full codec chain, synchronously. Pure compute -- no IO. + + Parameters + ---------- + chunk_bytes : Buffer + The encoded chunk bytes. + chunk_shape : tuple[int, ...] or None + The shape of this chunk. If None, uses the shape from the + ArraySpec provided at construction. Required for rectilinear + grids where chunks have different shapes. """ + spec = self._ab_spec if chunk_shape is None else self._spec_for_shape(chunk_shape) + data: Buffer = chunk_bytes for bb_codec in reversed(self._bb_codecs): - data = bb_codec._decode_sync(data, self._ab_spec) + data = bb_codec._decode_sync(data, spec) - chunk_array: NDBuffer = self._ab_codec._decode_sync(data, self._ab_spec) + chunk_array: NDBuffer = self._ab_codec._decode_sync(data, spec) - for aa_codec, spec in reversed(self._aa_codecs): - chunk_array = aa_codec._decode_sync(chunk_array, spec) + for aa_codec, aa_spec in reversed(self._aa_codecs): + aa_spec_resolved = aa_spec if chunk_shape is None else self._spec_for_shape(chunk_shape) + chunk_array = aa_codec._decode_sync(chunk_array, aa_spec_resolved) return chunk_array def encode_chunk( self, chunk_array: NDBuffer, + chunk_shape: tuple[int, ...] | None = None, ) -> Buffer | None: """Encode a single chunk through the full codec chain, synchronously. Pure compute -- no IO. + + Parameters + ---------- + chunk_array : NDBuffer + The chunk data to encode. + chunk_shape : tuple[int, ...] or None + The shape of this chunk. If None, uses the shape from the + ArraySpec provided at construction. 
""" + spec = self._ab_spec if chunk_shape is None else self._spec_for_shape(chunk_shape) + aa_data: NDBuffer = chunk_array - for aa_codec, spec in self._aa_codecs: - aa_result = aa_codec._encode_sync(aa_data, spec) + for aa_codec, aa_spec in self._aa_codecs: + aa_spec_resolved = aa_spec if chunk_shape is None else self._spec_for_shape(chunk_shape) + aa_result = aa_codec._encode_sync(aa_data, aa_spec_resolved) if aa_result is None: return None aa_data = aa_result - ab_result = self._ab_codec._encode_sync(aa_data, self._ab_spec) + ab_result = self._ab_codec._encode_sync(aa_data, spec) if ab_result is None: return None bb_data: Buffer = ab_result for bb_codec in self._bb_codecs: - bb_result = bb_codec._encode_sync(bb_data, self._ab_spec) + bb_result = bb_codec._encode_sync(bb_data, spec) if bb_result is None: return None bb_data = bb_result @@ -1104,7 +1135,7 @@ def _transform_read( return self._decode_shard(raw, chunk_spec, self.shard_layout) assert self.chunk_transform is not None - return self.chunk_transform.decode_chunk(raw) + return self.chunk_transform.decode_chunk(raw, chunk_shape=chunk_spec.shape) def _decode_shard(self, blob: Buffer, shard_spec: ArraySpec, layout: ShardLayout) -> NDBuffer: """Decode a full shard blob into a shard-shaped array. Pure compute. @@ -1163,14 +1194,18 @@ def _transform_write( assert self.chunk_transform is not None + chunk_shape = chunk_spec.shape + if existing is not None: - chunk_array: NDBuffer | None = self.chunk_transform.decode_chunk(existing) + chunk_array: NDBuffer | None = self.chunk_transform.decode_chunk( + existing, chunk_shape=chunk_shape + ) else: chunk_array = None if chunk_array is None: chunk_array = chunk_spec.prototype.nd_buffer.create( - shape=chunk_spec.shape, + shape=chunk_shape, dtype=chunk_spec.dtype.to_native_dtype(), fill_value=fill_value_or_default(chunk_spec), ) @@ -1188,7 +1223,7 @@ def _transform_write( chunk_value = chunk_value[item] chunk_array[chunk_selection] = chunk_value - return self.chunk_transform.encode_chunk(chunk_array) + return self.chunk_transform.encode_chunk(chunk_array, chunk_shape=chunk_shape) def _transform_write_shard( self, From ae0580c9442cdfde66f3d318108f72bcd6a426d2 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 9 Apr 2026 10:38:17 +0200 Subject: [PATCH 05/22] fixup --- src/zarr/abc/codec.py | 118 +++----------------------------- src/zarr/codecs/bytes.py | 116 +------------------------------ src/zarr/core/array.py | 40 +++++------ src/zarr/core/codec_pipeline.py | 4 +- 4 files changed, 32 insertions(+), 246 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index c9713daa6a..ae8a78a34d 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -14,7 +14,7 @@ if TYPE_CHECKING: from collections.abc import Awaitable, Callable, Iterable - from typing import Any, Self + from typing import Self from zarr.abc.store import ByteGetter, ByteSetter, Store from zarr.core.array_spec import ArraySpec @@ -36,7 +36,6 @@ "GetResult", "PreparedWrite", "SupportsChunkCodec", - "SupportsChunkMapping", "SupportsSyncCodec", ] @@ -89,117 +88,20 @@ def _encode_sync(self, chunk_data: CI, chunk_spec: ArraySpec) -> CO | None: ... class SupportsChunkCodec(Protocol): """Protocol for objects that can decode/encode whole chunks synchronously. - `ChunkTransform` satisfies this protocol. + `ChunkTransform` satisfies this protocol. The ``chunk_shape`` parameter + allows decoding/encoding chunks of different shapes (e.g. rectilinear + grids) without rebuilding the transform. 
""" array_spec: ArraySpec - def decode_chunk(self, chunk_bytes: Buffer) -> NDBuffer: ... + def decode_chunk( + self, chunk_bytes: Buffer, chunk_shape: tuple[int, ...] | None = None + ) -> NDBuffer: ... - def encode_chunk(self, chunk_array: NDBuffer) -> Buffer | None: ... - - -@runtime_checkable -class SupportsChunkMapping(Protocol): - """Protocol for codecs that expose their stored data as a mapping - from chunk coordinates to encoded buffers. - - A single store key holds a blob. This protocol defines how to - interpret that blob as a ``dict[tuple[int, ...], Buffer | None]`` — - a mapping from inner-chunk coordinates to their encoded bytes. - - For a non-sharded codec (``BytesCodec``), the mapping is trivial: - one entry at ``(0,)`` containing the entire blob. For a sharded - codec, the mapping has one entry per inner chunk, derived from the - shard index embedded in the blob. The pipeline doesn't need to know - which case it's dealing with — it operates on the mapping uniformly. - - This abstraction enables the three-phase IO/compute/IO pattern: - - 1. **IO**: fetch the blob from the store. - 2. **Compute**: unpack the blob into the chunk mapping, decode/merge/ - re-encode entries, pack back into a blob. All pure compute. - 3. **IO**: write the blob to the store. - """ - - @property - def inner_codec_chain(self) -> SupportsChunkCodec | None: - """The codec chain for inner chunks, or `None` to use the pipeline's.""" - ... - - def unpack_chunks( - self, - raw: Buffer | None, - chunk_spec: ArraySpec, - ) -> dict[tuple[int, ...], Buffer | None]: - """Unpack a storage blob into per-inner-chunk encoded buffers.""" - ... - - def pack_chunks( - self, - chunk_dict: dict[tuple[int, ...], Buffer | None], - chunk_spec: ArraySpec, - ) -> Buffer | None: - """Pack per-inner-chunk encoded buffers into a single storage blob.""" - ... - - def prepare_read_sync( - self, - byte_getter: Any, - chunk_selection: SelectorTuple, - codec_chain: SupportsChunkCodec, - ) -> NDBuffer | None: - """Fetch and decode a chunk synchronously, returning the selected region.""" - ... - - def prepare_write_sync( - self, - byte_setter: Any, - codec_chain: SupportsChunkCodec, - chunk_selection: SelectorTuple, - out_selection: SelectorTuple, - replace: bool, - ) -> PreparedWrite: - """Prepare a synchronous write: fetch existing data if needed, unpack.""" - ... - - def finalize_write_sync( - self, - prepared: PreparedWrite, - chunk_spec: ArraySpec, - byte_setter: Any, - ) -> None: - """Pack the prepared chunk data and write it to the store.""" - ... - - async def prepare_read( - self, - byte_getter: Any, - chunk_selection: SelectorTuple, - codec_chain: SupportsChunkCodec, - ) -> NDBuffer | None: - """Async variant of `prepare_read_sync`.""" - ... - - async def prepare_write( - self, - byte_setter: Any, - codec_chain: SupportsChunkCodec, - chunk_selection: SelectorTuple, - out_selection: SelectorTuple, - replace: bool, - ) -> PreparedWrite: - """Async variant of `prepare_write_sync`.""" - ... - - async def finalize_write( - self, - prepared: PreparedWrite, - chunk_spec: ArraySpec, - byte_setter: Any, - ) -> None: - """Async variant of `finalize_write_sync`.""" - ... + def encode_chunk( + self, chunk_array: NDBuffer, chunk_shape: tuple[int, ...] | None = None + ) -> Buffer | None: ... 
class BaseCodec[CI: CodecInput, CO: CodecOutput](Metadata): diff --git a/src/zarr/codecs/bytes.py b/src/zarr/codecs/bytes.py index ac6dc3dd8e..86bb354fb5 100644 --- a/src/zarr/codecs/bytes.py +++ b/src/zarr/codecs/bytes.py @@ -5,16 +5,15 @@ from enum import Enum from typing import TYPE_CHECKING -from zarr.abc.codec import ArrayBytesCodec, PreparedWrite, SupportsChunkCodec +from zarr.abc.codec import ArrayBytesCodec from zarr.core.buffer import Buffer, NDBuffer from zarr.core.common import JSON, parse_enum, parse_named_configuration from zarr.core.dtype.common import HasEndianness if TYPE_CHECKING: - from typing import Any, Self + from typing import Self from zarr.core.array_spec import ArraySpec - from zarr.core.indexing import SelectorTuple class Endian(Enum): @@ -126,114 +125,3 @@ async def _encode_single( def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int: return input_byte_length - - # -- SupportsChunkMapping -- - - @property - def inner_codec_chain(self) -> SupportsChunkCodec | None: - """Returns `None` — the pipeline should use its own codec chain.""" - return None - - def unpack_chunks( - self, - raw: Buffer | None, - chunk_spec: ArraySpec, - ) -> dict[tuple[int, ...], Buffer | None]: - """Single chunk keyed at `(0,)`.""" - return {(0,): raw} - - def pack_chunks( - self, - chunk_dict: dict[tuple[int, ...], Buffer | None], - chunk_spec: ArraySpec, - ) -> Buffer | None: - """Return the single chunk's bytes.""" - return chunk_dict.get((0,)) - - def prepare_read_sync( - self, - byte_getter: Any, - chunk_selection: SelectorTuple, - codec_chain: SupportsChunkCodec, - ) -> NDBuffer | None: - """Fetch, decode, and return the selected region synchronously.""" - raw = byte_getter.get_sync(prototype=codec_chain.array_spec.prototype) - if raw is None: - return None - chunk_array = codec_chain.decode_chunk(raw) - return chunk_array[chunk_selection] - - def prepare_write_sync( - self, - byte_setter: Any, - codec_chain: SupportsChunkCodec, - chunk_selection: SelectorTuple, - out_selection: SelectorTuple, - replace: bool, - ) -> PreparedWrite: - """Fetch existing data if needed, unpack, return `PreparedWrite`.""" - from zarr.core.indexing import ChunkProjection - - existing: Buffer | None = None - if not replace: - existing = byte_setter.get_sync(prototype=codec_chain.array_spec.prototype) - chunk_dict = self.unpack_chunks(existing, codec_chain.array_spec) - indexer = [ChunkProjection((0,), chunk_selection, out_selection, replace)] # type: ignore[arg-type] - return PreparedWrite(chunk_dict=chunk_dict, indexer=indexer) - - def finalize_write_sync( - self, - prepared: PreparedWrite, - chunk_spec: ArraySpec, - byte_setter: Any, - ) -> None: - """Pack and write to store, or delete if empty.""" - blob = self.pack_chunks(prepared.chunk_dict, chunk_spec) - if blob is None: - byte_setter.delete_sync() - else: - byte_setter.set_sync(blob) - - async def prepare_read( - self, - byte_getter: Any, - chunk_selection: SelectorTuple, - codec_chain: SupportsChunkCodec, - ) -> NDBuffer | None: - """Async variant of `prepare_read_sync`.""" - raw = await byte_getter.get(prototype=codec_chain.array_spec.prototype) - if raw is None: - return None - chunk_array = codec_chain.decode_chunk(raw) - return chunk_array[chunk_selection] - - async def prepare_write( - self, - byte_setter: Any, - codec_chain: SupportsChunkCodec, - chunk_selection: SelectorTuple, - out_selection: SelectorTuple, - replace: bool, - ) -> PreparedWrite: - """Async variant of `prepare_write_sync`.""" - from 
zarr.core.indexing import ChunkProjection - - existing: Buffer | None = None - if not replace: - existing = await byte_setter.get(prototype=codec_chain.array_spec.prototype) - chunk_dict = self.unpack_chunks(existing, codec_chain.array_spec) - indexer = [ChunkProjection((0,), chunk_selection, out_selection, replace)] # type: ignore[arg-type] - return PreparedWrite(chunk_dict=chunk_dict, indexer=indexer) - - async def finalize_write( - self, - prepared: PreparedWrite, - chunk_spec: ArraySpec, - byte_setter: Any, - ) -> None: - """Async variant of `finalize_write_sync`.""" - blob = self.pack_chunks(prepared.chunk_dict, chunk_spec) - if blob is None: - await byte_setter.delete() - else: - await byte_setter.set(blob) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index d52d27afd6..765cd2728b 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -229,29 +229,23 @@ def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = None if isinstance(metadata, ArrayV3Metadata): pipeline = get_pipeline_class().from_codecs(metadata.codecs) - # PhasedCodecPipeline needs evolve_from_array_spec to build its - # ChunkTransform and ShardLayout. BatchedCodecPipeline does not. - if hasattr(pipeline, "chunk_transform") and pipeline.chunk_transform is None: - from zarr.core.metadata.v3 import RegularChunkGridMetadata - - # Use the regular chunk shape if available, otherwise use a - # placeholder shape. The ChunkTransform is shape-agnostic — - # the actual chunk shape is passed per-call at decode/encode time. - if isinstance(metadata.chunk_grid, RegularChunkGridMetadata): - chunk_shape = metadata.chunk_grid.chunk_shape - else: - # Rectilinear: use a 1-element shape per dimension as placeholder. - # Only dtype/fill_value/config matter for codec evolution. - chunk_shape = (1,) * len(metadata.shape) - chunk_spec = ArraySpec( - shape=chunk_shape, - dtype=metadata.data_type, - fill_value=metadata.fill_value, - config=ArrayConfig.from_dict({}), - prototype=default_buffer_prototype(), - ) - pipeline = pipeline.evolve_from_array_spec(chunk_spec) - return pipeline + from zarr.core.metadata.v3 import RegularChunkGridMetadata + + # Use the regular chunk shape if available, otherwise use a + # placeholder. The ChunkTransform is shape-agnostic — the actual + # chunk shape is passed per-call at decode/encode time. + if isinstance(metadata.chunk_grid, RegularChunkGridMetadata): + chunk_shape = metadata.chunk_grid.chunk_shape + else: + chunk_shape = (1,) * len(metadata.shape) + chunk_spec = ArraySpec( + shape=chunk_shape, + dtype=metadata.data_type, + fill_value=metadata.fill_value, + config=ArrayConfig.from_dict({}), + prototype=default_buffer_prototype(), + ) + return pipeline.evolve_from_array_spec(chunk_spec) elif isinstance(metadata, ArrayV2Metadata): v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor) return get_pipeline_class().from_codecs([v2_codec]) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 57266df75c..738c2a1d66 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -656,11 +656,13 @@ def codecs_from_list( ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: from zarr.codecs.sharding import ShardingCodec + codecs = tuple(codecs) # materialize to avoid generator consumption issues + array_array: tuple[ArrayArrayCodec, ...] = () array_bytes_maybe: ArrayBytesCodec | None = None bytes_bytes: tuple[BytesBytesCodec, ...] 
= () - if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(tuple(codecs)) > 1: + if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs) > 1: warn( "Combining a `sharding_indexed` codec disables partial reads and " "writes, which may lead to inefficient performance.", From 0effe4d49e170096709944ffe57d83cdcf1d16a1 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 17 Apr 2026 22:51:21 +0200 Subject: [PATCH 06/22] feat: SupportsSetRange protocol + sync byte-range writes Adds a SupportsSetRange protocol to zarr.abc.store for stores that allow overwriting a byte range within an existing value. Implementations are added for LocalStore (using file-handle seek+write) and MemoryStore (in-memory bytearray slice assignment). This is the prerequisite for the partial-shard write fast path in ShardingCodec, which can patch individual inner-chunk slots without rewriting the entire shard blob when the inner codec chain is fixed-size. Co-Authored-By: Claude Opus 4.7 (1M context) --- changes/3907.feature.md | 1 + src/zarr/abc/store.py | 18 ++++++++++++ src/zarr/storage/_local.py | 23 ++++++++++++++- src/zarr/storage/_memory.py | 24 ++++++++++++++-- tests/test_store/test_local.py | 49 ++++++++++++++++++++++++++++++++ tests/test_store/test_memory.py | 50 +++++++++++++++++++++++++++++++++ 6 files changed, 162 insertions(+), 3 deletions(-) create mode 100644 changes/3907.feature.md diff --git a/changes/3907.feature.md b/changes/3907.feature.md new file mode 100644 index 0000000000..66b908d305 --- /dev/null +++ b/changes/3907.feature.md @@ -0,0 +1 @@ +Add protocols for stores that support byte-range-writes. This is necessary to support in-place writes of sharded arrays. \ No newline at end of file diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index 600df17ee5..c33651f016 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -22,6 +22,7 @@ "Store", "SupportsDeleteSync", "SupportsGetSync", + "SupportsSetRange", "SupportsSetSync", "SupportsSyncStore", "set_or_delete", @@ -709,6 +710,23 @@ async def delete(self) -> None: ... async def set_if_not_exists(self, default: Buffer) -> None: ... +@runtime_checkable +class SupportsSetRange(Protocol): + """Protocol for stores that support writing to a byte range within an existing value. + + Overwrites ``len(value)`` bytes starting at byte offset ``start`` within the + existing stored value for ``key``. The key must already exist and the write + must fit within the existing value (i.e., ``start + len(value) <= len(existing)``). + + Behavior when the write extends past the end of the existing value is + implementation-specific and should not be relied upon. + """ + + async def set_range(self, key: str, value: Buffer, start: int) -> None: ... + + def set_range_sync(self, key: str, value: Buffer, start: int) -> None: ... 
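An illustrative use of the sync variant (mirroring the MemoryStore tests added later in this commit; the snippet is a sketch, not part of the diff): overwrite two bytes in the middle of an existing ten-byte value.

    from zarr.core.buffer import cpu
    from zarr.core.sync import sync
    from zarr.storage import MemoryStore

    store = MemoryStore()
    sync(store.set("k", cpu.Buffer.from_bytes(b"AAAAAAAAAA")))
    store.set_range_sync("k", cpu.Buffer.from_bytes(b"XX"), start=3)
    # the stored value for "k" is now b"AAAXXAAAAA"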
+ + @runtime_checkable class SupportsGetSync(Protocol): def get_sync( diff --git a/src/zarr/storage/_local.py b/src/zarr/storage/_local.py index 96f1e61746..a0eda303e1 100644 --- a/src/zarr/storage/_local.py +++ b/src/zarr/storage/_local.py @@ -16,6 +16,7 @@ RangeByteRequest, Store, SuffixByteRequest, + SupportsSetRange, ) from zarr.core.buffer import Buffer from zarr.core.buffer.core import default_buffer_prototype @@ -77,6 +78,13 @@ def _atomic_write( raise +def _put_range(path: Path, value: Buffer, start: int) -> None: + """Write bytes at a specific offset within an existing file.""" + with path.open("r+b") as f: + f.seek(start) + f.write(value.as_numpy_array().tobytes()) + + def _put(path: Path, value: Buffer, exclusive: bool = False) -> int: path.parent.mkdir(parents=True, exist_ok=True) # write takes any object supporting the buffer protocol @@ -85,7 +93,7 @@ def _put(path: Path, value: Buffer, exclusive: bool = False) -> int: return f.write(view) -class LocalStore(Store): +class LocalStore(Store, SupportsSetRange): """ Store for the local file system. @@ -292,6 +300,19 @@ async def _set(self, key: str, value: Buffer, exclusive: bool = False) -> None: path = self.root / key await asyncio.to_thread(_put, path, value, exclusive=exclusive) + async def set_range(self, key: str, value: Buffer, start: int) -> None: + if not self._is_open: + await self._open() + self._check_writable() + path = self.root / key + await asyncio.to_thread(_put_range, path, value, start) + + def set_range_sync(self, key: str, value: Buffer, start: int) -> None: + self._ensure_open_sync() + self._check_writable() + path = self.root / key + _put_range(path, value, start) + async def delete(self, key: str) -> None: """ Remove a key from the store. diff --git a/src/zarr/storage/_memory.py b/src/zarr/storage/_memory.py index 1194894b9d..cb773ae30a 100644 --- a/src/zarr/storage/_memory.py +++ b/src/zarr/storage/_memory.py @@ -3,7 +3,7 @@ from logging import getLogger from typing import TYPE_CHECKING, Any, Self -from zarr.abc.store import ByteRequest, Store +from zarr.abc.store import ByteRequest, Store, SupportsSetRange from zarr.core.buffer import Buffer, gpu from zarr.core.buffer.core import default_buffer_prototype from zarr.core.common import concurrent_map @@ -18,7 +18,7 @@ logger = getLogger(__name__) -class MemoryStore(Store): +class MemoryStore(Store, SupportsSetRange): """ Store for local memory. 
@@ -186,6 +186,26 @@ async def delete(self, key: str) -> None: except KeyError: logger.debug("Key %s does not exist.", key) + def _set_range_impl(self, key: str, value: Buffer, start: int) -> None: + buf = self._store_dict[key] + target = buf.as_numpy_array() + if not target.flags.writeable: + target = target.copy() + self._store_dict[key] = buf.__class__(target) + source = value.as_numpy_array() + target[start : start + len(source)] = source + + async def set_range(self, key: str, value: Buffer, start: int) -> None: + self._check_writable() + await self._ensure_open() + self._set_range_impl(key, value, start) + + def set_range_sync(self, key: str, value: Buffer, start: int) -> None: + self._check_writable() + if not self._is_open: + self._is_open = True + self._set_range_impl(key, value, start) + async def list(self) -> AsyncIterator[str]: # docstring inherited for key in self._store_dict: diff --git a/tests/test_store/test_local.py b/tests/test_store/test_local.py index bdc9b48121..0712cd1bca 100644 --- a/tests/test_store/test_local.py +++ b/tests/test_store/test_local.py @@ -10,6 +10,7 @@ import zarr from zarr import create_array +from zarr.abc.store import SupportsSetRange from zarr.core.buffer import Buffer, cpu from zarr.core.sync import sync from zarr.storage import LocalStore @@ -162,6 +163,54 @@ def test_get_json_sync_with_prototype_none( result = store._get_json_sync(key, prototype=buffer_cls) assert result == data + def test_supports_set_range(self, store: LocalStore) -> None: + """LocalStore should implement SupportsSetRange.""" + assert isinstance(store, SupportsSetRange) + + @pytest.mark.parametrize( + ("start", "patch", "expected"), + [ + (0, b"XX", b"XXAAAAAAAA"), + (3, b"XX", b"AAAXXAAAAA"), + (8, b"XX", b"AAAAAAAAXX"), + (0, b"ZZZZZZZZZZ", b"ZZZZZZZZZZ"), + (5, b"B", b"AAAAABAAAA"), + (0, b"BCDE", b"BCDEAAAAAA"), + ], + ids=["start", "middle", "end", "full-overwrite", "single-byte", "multi-byte-start"], + ) + async def test_set_range( + self, store: LocalStore, start: int, patch: bytes, expected: bytes + ) -> None: + """set_range should overwrite bytes at the given offset.""" + await store.set("test/key", cpu.Buffer.from_bytes(b"AAAAAAAAAA")) + await store.set_range("test/key", cpu.Buffer.from_bytes(patch), start=start) + result = await store.get("test/key", prototype=cpu.buffer_prototype) + assert result is not None + assert result.to_bytes() == expected + + @pytest.mark.parametrize( + ("start", "patch", "expected"), + [ + (0, b"XX", b"XXAAAAAAAA"), + (3, b"XX", b"AAAXXAAAAA"), + (8, b"XX", b"AAAAAAAAXX"), + (0, b"ZZZZZZZZZZ", b"ZZZZZZZZZZ"), + (5, b"B", b"AAAAABAAAA"), + (0, b"BCDE", b"BCDEAAAAAA"), + ], + ids=["start", "middle", "end", "full-overwrite", "single-byte", "multi-byte-start"], + ) + def test_set_range_sync( + self, store: LocalStore, start: int, patch: bytes, expected: bytes + ) -> None: + """set_range_sync should overwrite bytes at the given offset.""" + sync(store.set("test/key", cpu.Buffer.from_bytes(b"AAAAAAAAAA"))) + store.set_range_sync("test/key", cpu.Buffer.from_bytes(patch), start=start) + result = store.get_sync(key="test/key", prototype=cpu.buffer_prototype) + assert result is not None + assert result.to_bytes() == expected + @pytest.mark.parametrize("exclusive", [True, False]) def test_atomic_write_successful(tmp_path: pathlib.Path, exclusive: bool) -> None: diff --git a/tests/test_store/test_memory.py b/tests/test_store/test_memory.py index 03c8b24271..d2554b411f 100644 --- a/tests/test_store/test_memory.py +++ b/tests/test_store/test_memory.py 
@@ -9,6 +9,7 @@ import pytest import zarr +from zarr.abc.store import SupportsSetRange from zarr.core.buffer import Buffer, cpu, gpu from zarr.core.sync import sync from zarr.errors import ZarrUserWarning @@ -127,6 +128,55 @@ def test_get_json_sync_with_prototype_none( result = store._get_json_sync(key, prototype=buffer_cls) assert result == data + def test_supports_set_range(self, store: MemoryStore) -> None: + """MemoryStore should implement SupportsSetRange.""" + assert isinstance(store, SupportsSetRange) + + @pytest.mark.parametrize( + ("start", "patch", "expected"), + [ + (0, b"XX", b"XXAAAAAAAA"), + (3, b"XX", b"AAAXXAAAAA"), + (8, b"XX", b"AAAAAAAAXX"), + (0, b"ZZZZZZZZZZ", b"ZZZZZZZZZZ"), + (5, b"B", b"AAAAABAAAA"), + (0, b"BCDE", b"BCDEAAAAAA"), + ], + ids=["start", "middle", "end", "full-overwrite", "single-byte", "multi-byte-start"], + ) + async def test_set_range( + self, store: MemoryStore, start: int, patch: bytes, expected: bytes + ) -> None: + """set_range should overwrite bytes at the given offset.""" + await store.set("test/key", cpu.Buffer.from_bytes(b"AAAAAAAAAA")) + await store.set_range("test/key", cpu.Buffer.from_bytes(patch), start=start) + result = await store.get("test/key", prototype=cpu.buffer_prototype) + assert result is not None + assert result.to_bytes() == expected + + @pytest.mark.parametrize( + ("start", "patch", "expected"), + [ + (0, b"XX", b"XXAAAAAAAA"), + (3, b"XX", b"AAAXXAAAAA"), + (8, b"XX", b"AAAAAAAAXX"), + (0, b"ZZZZZZZZZZ", b"ZZZZZZZZZZ"), + (5, b"B", b"AAAAABAAAA"), + (0, b"BCDE", b"BCDEAAAAAA"), + ], + ids=["start", "middle", "end", "full-overwrite", "single-byte", "multi-byte-start"], + ) + def test_set_range_sync( + self, store: MemoryStore, start: int, patch: bytes, expected: bytes + ) -> None: + """set_range_sync should overwrite bytes at the given offset.""" + store._is_open = True + store._store_dict["test/key"] = cpu.Buffer.from_bytes(b"AAAAAAAAAA") + store.set_range_sync("test/key", cpu.Buffer.from_bytes(patch), start=start) + result = store.get_sync(key="test/key", prototype=cpu.buffer_prototype) + assert result is not None + assert result.to_bytes() == expected + # TODO: fix this warning @pytest.mark.filterwarnings("ignore:Unclosed client session:ResourceWarning") From 7f45aba942c380e84a9653159f0568cf20538d2f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 17 Apr 2026 22:51:36 +0200 Subject: [PATCH 07/22] feat: add sync codec methods to V2 and numcodecs codecs V2Codec, BytesCodec, BloscCodec, etc. previously only implemented the async _decode_single / _encode_single methods. Add their sync counterparts (_decode_sync / _encode_sync) so that the upcoming SyncCodecPipeline can dispatch through them without spinning up an event loop. For codecs that wrap external compressors (numcodecs.Zstd, numcodecs.Blosc, the V2 fallback chain), the sync versions just call the underlying compressor's blocking API directly instead of routing through asyncio.to_thread. 
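The resulting shape of each codec is roughly as follows (sketch only — the class name here is
hypothetical, and the concrete signatures are in the diff below):

    import asyncio

    class SomeWrappedCodec:
        def _decode_sync(self, chunk_bytes, chunk_spec):
            # direct, blocking call into the wrapped compressor / filters
            ...

        async def _decode_single(self, chunk_bytes, chunk_spec):
            # the async entry point offloads the blocking sync path to a worker thread
            return await asyncio.to_thread(self._decode_sync, chunk_bytes, chunk_spec)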
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/zarr/codecs/_v2.py | 37 ++++++++++---------- src/zarr/codecs/numcodecs/_codecs.py | 50 +++++++++++++++++----------- 2 files changed, 50 insertions(+), 37 deletions(-) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index 3c6c99c21c..bb34e31b8a 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -23,7 +23,7 @@ class V2Codec(ArrayBytesCodec): is_fixed_size = False - async def _decode_single( + def _decode_sync( self, chunk_bytes: Buffer, chunk_spec: ArraySpec, @@ -31,14 +31,14 @@ async def _decode_single( cdata = chunk_bytes.as_array_like() # decompress if self.compressor: - chunk = await asyncio.to_thread(self.compressor.decode, cdata) + chunk = self.compressor.decode(cdata) else: chunk = cdata # apply filters if self.filters: for f in reversed(self.filters): - chunk = await asyncio.to_thread(f.decode, chunk) + chunk = f.decode(chunk) # view as numpy array with correct dtype chunk = ensure_ndarray_like(chunk) @@ -48,20 +48,9 @@ async def _decode_single( try: chunk = chunk.view(chunk_spec.dtype.to_native_dtype()) except TypeError: - # this will happen if the dtype of the chunk - # does not match the dtype of the array spec i.g. if - # the dtype of the chunk_spec is a string dtype, but the chunk - # is an object array. In this case, we need to convert the object - # array to the correct dtype. - chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) elif chunk.dtype != object: - # If we end up here, someone must have hacked around with the filters. - # We cannot deal with object arrays unless there is an object - # codec in the filter chain, i.e., a filter that converts from object - # array to something else during encoding, and converts back to object - # array during decoding. 
raise RuntimeError("cannot read object array without object codec") # ensure correct chunk shape @@ -70,7 +59,7 @@ async def _decode_single( return get_ndbuffer_class().from_ndarray_like(chunk) - async def _encode_single( + def _encode_sync( self, chunk_array: NDBuffer, chunk_spec: ArraySpec, @@ -83,18 +72,32 @@ async def _encode_single( # apply filters if self.filters: for f in self.filters: - chunk = await asyncio.to_thread(f.encode, chunk) + chunk = f.encode(chunk) # check object encoding if ensure_ndarray_like(chunk).dtype == object: raise RuntimeError("cannot write object array without object codec") # compress if self.compressor: - cdata = await asyncio.to_thread(self.compressor.encode, chunk) + cdata = self.compressor.encode(chunk) else: cdata = chunk cdata = ensure_bytes(cdata) return chunk_spec.prototype.buffer.from_bytes(cdata) + async def _decode_single( + self, + chunk_bytes: Buffer, + chunk_spec: ArraySpec, + ) -> NDBuffer: + return await asyncio.to_thread(self._decode_sync, chunk_bytes, chunk_spec) + + async def _encode_single( + self, + chunk_array: NDBuffer, + chunk_spec: ArraySpec, + ) -> Buffer | None: + return await asyncio.to_thread(self._encode_sync, chunk_array, chunk_spec) + def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int: raise NotImplementedError diff --git a/src/zarr/codecs/numcodecs/_codecs.py b/src/zarr/codecs/numcodecs/_codecs.py index 06c085ad2a..2b831661e8 100644 --- a/src/zarr/codecs/numcodecs/_codecs.py +++ b/src/zarr/codecs/numcodecs/_codecs.py @@ -45,7 +45,7 @@ if TYPE_CHECKING: from zarr.abc.numcodec import Numcodec from zarr.core.array_spec import ArraySpec - from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer + from zarr.core.buffer import Buffer, NDBuffer CODEC_PREFIX = "numcodecs." 
@@ -132,53 +132,63 @@ class _NumcodecsBytesBytesCodec(_NumcodecsCodec, BytesBytesCodec): def __init__(self, **codec_config: JSON) -> None: super().__init__(**codec_config) - async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: - return await asyncio.to_thread( - as_numpy_array_wrapper, - self._codec.decode, - chunk_data, - chunk_spec.prototype, - ) + def _decode_sync(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + return as_numpy_array_wrapper(self._codec.decode, chunk_data, chunk_spec.prototype) - def _encode(self, chunk_data: Buffer, prototype: BufferPrototype) -> Buffer: + def _encode_sync(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: encoded = self._codec.encode(chunk_data.as_array_like()) if isinstance(encoded, np.ndarray): # Required for checksum codecs - return prototype.buffer.from_bytes(encoded.tobytes()) - return prototype.buffer.from_bytes(encoded) + return chunk_spec.prototype.buffer.from_bytes(encoded.tobytes()) + return chunk_spec.prototype.buffer.from_bytes(encoded) + + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: + return await asyncio.to_thread(self._decode_sync, chunk_data, chunk_spec) async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer: - return await asyncio.to_thread(self._encode, chunk_data, chunk_spec.prototype) + return await asyncio.to_thread(self._encode_sync, chunk_data, chunk_spec) class _NumcodecsArrayArrayCodec(_NumcodecsCodec, ArrayArrayCodec): def __init__(self, **codec_config: JSON) -> None: super().__init__(**codec_config) - async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + def _decode_sync(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: chunk_ndarray = chunk_data.as_ndarray_like() - out = await asyncio.to_thread(self._codec.decode, chunk_ndarray) + out = self._codec.decode(chunk_ndarray) return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) - async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + def _encode_sync(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: chunk_ndarray = chunk_data.as_ndarray_like() - out = await asyncio.to_thread(self._codec.encode, chunk_ndarray) + out = self._codec.encode(chunk_ndarray) return chunk_spec.prototype.nd_buffer.from_ndarray_like(out) + async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + return await asyncio.to_thread(self._decode_sync, chunk_data, chunk_spec) + + async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer: + return await asyncio.to_thread(self._encode_sync, chunk_data, chunk_spec) + class _NumcodecsArrayBytesCodec(_NumcodecsCodec, ArrayBytesCodec): def __init__(self, **codec_config: JSON) -> None: super().__init__(**codec_config) - async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer: + def _decode_sync(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer: chunk_bytes = chunk_data.to_bytes() - out = await asyncio.to_thread(self._codec.decode, chunk_bytes) + out = self._codec.decode(chunk_bytes) return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape)) - async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer: + def _encode_sync(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer: chunk_ndarray = chunk_data.as_ndarray_like() - out = await 
asyncio.to_thread(self._codec.encode, chunk_ndarray) + out = self._codec.encode(chunk_ndarray) return chunk_spec.prototype.buffer.from_bytes(out) + async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer: + return await asyncio.to_thread(self._decode_sync, chunk_data, chunk_spec) + + async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer: + return await asyncio.to_thread(self._encode_sync, chunk_data, chunk_spec) + # bytes-to-bytes codecs class Blosc(_NumcodecsBytesBytesCodec, codec_name="blosc"): From 9b26f90da0c076e549f91a42432b1e8bd3f3d9cc Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 17 Apr 2026 22:53:31 +0200 Subject: [PATCH 08/22] =?UTF-8?q?feat:=20SyncCodecPipeline=20=E2=80=94=20s?= =?UTF-8?q?ynchronous=20codec=20pipeline=20with=20per-chunk=20parallelism?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds SyncCodecPipeline alongside BatchedCodecPipeline. The new pipeline runs codecs through their sync entry points (_decode_sync / _encode_sync) and dispatches per-chunk work to a module-level thread pool sized by the codec_pipeline.max_workers config (default = os.cpu_count()). Each chunk's full lifecycle (fetch + decode + scatter for reads; get-existing + merge + encode + set/delete for writes) runs as one pool task — overlapping IO of one chunk with compute of another. Scatter into the shared output buffer is thread-safe because chunks have non-overlapping output selections. The async wrappers (read/write) detect SupportsGetSync/SupportsSetSync stores and dispatch to the sync fast path, passing the configured max_workers. Other stores fall through to the async path, which still uses asyncio.concurrent_map at async.concurrency. Notes on perf: - Default (None → cpu_count) is tuned for chunks ≥ ~512 KB. - Small chunks (≤ 64 KB) regress 1.5-3x because pool dispatch overhead (~30-50 µs/task) dominates per-chunk work. Workaround: zarr.config.set({"codec_pipeline.max_workers": 1}). - For large chunks on local/memory stores, IO+compute parallelism yields 1.7-2.5x over BatchedCodecPipeline on direct-API reads and ~2.5x on roundtrip. ChunkTransform encapsulates the sync codec chain. It caches resolved ArraySpecs across calls with the same chunk_spec — combined with the constant-ArraySpec optimization in indexing, hot-path overhead is minimized. Includes test scaffolding for the new pipeline (test_sync_codec_pipeline) and config plumbing for the max_workers key. Co-Authored-By: Claude Opus 4.7 (1M context) --- changes/3908.misc.md | 1 + src/zarr/core/array.py | 72 +++- src/zarr/core/codec_pipeline.py | 670 ++++++++++++++++++++++++++++-- src/zarr/core/config.py | 1 + tests/test_codec_pipeline.py | 333 ++++++++++++++- tests/test_config.py | 3 +- tests/test_sync_codec_pipeline.py | 19 +- 7 files changed, 1036 insertions(+), 63 deletions(-) create mode 100644 changes/3908.misc.md diff --git a/changes/3908.misc.md b/changes/3908.misc.md new file mode 100644 index 0000000000..66717e8444 --- /dev/null +++ b/changes/3908.misc.md @@ -0,0 +1 @@ +Reuse a constant `ArraySpec` during indexing when possible. 
\ No newline at end of file diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index f0cd5dd734..3e1f63dc80 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -228,10 +228,35 @@ def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = None pass if isinstance(metadata, ArrayV3Metadata): - return get_pipeline_class().from_codecs(metadata.codecs) + pipeline = get_pipeline_class().from_codecs(metadata.codecs) + from zarr.core.metadata.v3 import RegularChunkGridMetadata + + # Use the regular chunk shape if available, otherwise use a + # placeholder. The ChunkTransform is shape-agnostic — the actual + # chunk shape is passed per-call at decode/encode time. + if isinstance(metadata.chunk_grid, RegularChunkGridMetadata): + chunk_shape = metadata.chunk_grid.chunk_shape + else: + chunk_shape = (1,) * len(metadata.shape) + chunk_spec = ArraySpec( + shape=chunk_shape, + dtype=metadata.data_type, + fill_value=metadata.fill_value, + config=ArrayConfig.from_dict({}), + prototype=default_buffer_prototype(), + ) + return pipeline.evolve_from_array_spec(chunk_spec) elif isinstance(metadata, ArrayV2Metadata): v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor) - return get_pipeline_class().from_codecs([v2_codec]) + pipeline = get_pipeline_class().from_codecs([v2_codec]) + chunk_spec = ArraySpec( + shape=metadata.chunks, + dtype=metadata.dtype, + fill_value=metadata.fill_value, + config=ArrayConfig.from_dict({"order": metadata.order}), + prototype=default_buffer_prototype(), + ) + return pipeline.evolve_from_array_spec(chunk_spec) raise TypeError # pragma: no cover @@ -5366,6 +5391,37 @@ def _get_chunk_spec( ) +def _get_default_chunk_spec( + metadata: ArrayMetadata, + chunk_grid: ChunkGrid, + array_config: ArrayConfig, + prototype: BufferPrototype, +) -> ArraySpec | None: + """Build an ArraySpec for the regular (non-edge) chunk shape, or None if not regular. + + For regular grids, all chunks have the same codec_shape, so we can + build the ArraySpec once and reuse it for every chunk — avoiding the + per-chunk ChunkGrid.__getitem__ + ArraySpec construction overhead. + + .. note:: + Ideally the per-chunk ArraySpec would not exist at all: dtype, + fill_value, config, and prototype are constant across chunks — + only the shape varies (and only for edge chunks). A cleaner + design would pass a single ArraySpec plus a per-chunk shape + override, which ChunkTransform.decode_chunk already supports + via its ``chunk_shape`` parameter. + """ + if chunk_grid.is_regular: + return ArraySpec( + shape=chunk_grid.chunk_shape, + dtype=metadata.dtype, + fill_value=metadata.fill_value, + config=array_config, + prototype=prototype, + ) + return None + + async def _get_selection( store_path: StorePath, metadata: ArrayMetadata, @@ -5445,11 +5501,16 @@ async def _get_selection( # reading chunks and decoding them indexed_chunks = list(indexer) + # Pre-compute the default chunk spec for regular grids to avoid + # per-chunk ChunkGrid lookups and ArraySpec construction. 
+ default_spec = _get_default_chunk_spec(metadata, chunk_grid, _config, prototype) results = await codec_pipeline.read( [ ( store_path / metadata.encode_chunk_key(chunk_coords), - _get_chunk_spec(metadata, chunk_grid, chunk_coords, _config, prototype), + default_spec + if default_spec is not None + else _get_chunk_spec(metadata, chunk_grid, chunk_coords, _config, prototype), chunk_selection, out_selection, is_complete_chunk, @@ -5788,11 +5849,14 @@ async def _set_selection( _config = replace(_config, order=order) # merging with existing data and encoding chunks + default_spec = _get_default_chunk_spec(metadata, chunk_grid, _config, prototype) await codec_pipeline.write( [ ( store_path / metadata.encode_chunk_key(chunk_coords), - _get_chunk_spec(metadata, chunk_grid, chunk_coords, _config, prototype), + default_spec + if default_spec is not None + else _get_chunk_spec(metadata, chunk_grid, chunk_coords, _config, prototype), chunk_selection, out_selection, is_complete_chunk, diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 4cecc3a6d1..89e606c652 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -1,8 +1,10 @@ from __future__ import annotations +import threading +from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass, field from itertools import islice, pairwise -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, cast from warnings import warn from zarr.abc.codec import ( @@ -33,6 +35,64 @@ from zarr.core.metadata.v3 import ChunkGridMetadata +_pool: ThreadPoolExecutor | None = None +_pool_size: int = 0 +_pool_lock = threading.Lock() + + +def _resolve_max_workers() -> int: + """Resolve ``codec_pipeline.max_workers`` config to an effective worker count. + + ``None`` means "auto" → ``os.cpu_count()`` (or 1 if unavailable). + Values < 1 are clamped to 1 (sequential). + + Notes + ----- + The default (``None`` → ``cpu_count``) is tuned for large chunks + (≳ 1 MB encoded) where per-chunk decode + scatter is real work and + threading helps. For small chunks (≲ 64 KB) the per-task pool + overhead (≈ 30-50 µs submit + worker handoff) outweighs the work + and threading slows things down by 1.5-3x. If your workload uses + many small chunks, set ``codec_pipeline.max_workers=1`` explicitly: + + zarr.config.set({"codec_pipeline.max_workers": 1}) + + Approximate breakeven on uncompressed reads: 256-512 KB per chunk. + Compressed chunks shift the threshold lower because decode is real + CPU work that benefits from parallelism. + """ + import os as _os + + cfg = config.get("codec_pipeline.max_workers", default=None) + if cfg is None: + return _os.cpu_count() or 1 + return max(1, int(cfg)) + + +def _get_pool(max_workers: int) -> ThreadPoolExecutor: + """Get or create the module-level thread pool, sized to ``max_workers``. + + The pool grows on demand — if a request arrives for more workers than + the current pool has, the existing pool is shut down and replaced. + Shrinking requests reuse the existing larger pool (it just leaves + workers idle). + + Callers that want sequential execution should not call this — they + should run the task list inline. ``max_workers`` must be >= 1. 
+ """ + global _pool, _pool_size + if max_workers < 1: + raise ValueError(f"max_workers must be >= 1, got {max_workers}") + if _pool is None or _pool_size < max_workers: + with _pool_lock: + if _pool is None or _pool_size < max_workers: + if _pool is not None: + _pool.shutdown(wait=False) + _pool = ThreadPoolExecutor(max_workers=max_workers) + _pool_size = max_workers + return _pool + + def _unzip2[T, U](iterable: Iterable[tuple[T, U]]) -> tuple[list[T], list[U]]: out0: list[T] = [] out1: list[U] = [] @@ -69,24 +129,23 @@ def fill_value_or_default(chunk_spec: ArraySpec) -> Any: @dataclass(slots=True, kw_only=True) class ChunkTransform: - """A synchronous codec chain bound to an ArraySpec. + """A synchronous codec chain. - Provides `encode` and `decode` for pure-compute codec operations - (no IO, no threading, no batching). + Provides `encode_chunk` and `decode_chunk` for pure-compute codec + operations (no IO, no threading, no batching). The `chunk_spec` is + supplied per call so the same transform can be reused across chunks + with different shapes, prototypes, etc. All codecs must implement `SupportsSyncCodec`. Construction will raise `TypeError` if any codec does not. """ codecs: tuple[Codec, ...] - array_spec: ArraySpec - # (sync codec, input_spec) pairs in pipeline order. - _aa_codecs: tuple[tuple[SupportsSyncCodec[NDBuffer, NDBuffer], ArraySpec], ...] = field( + _aa_codecs: tuple[SupportsSyncCodec[NDBuffer, NDBuffer], ...] = field( init=False, repr=False, compare=False ) _ab_codec: SupportsSyncCodec[NDBuffer, Buffer] = field(init=False, repr=False, compare=False) - _ab_spec: ArraySpec = field(init=False, repr=False, compare=False) _bb_codecs: tuple[SupportsSyncCodec[Buffer, Buffer], ...] = field( init=False, repr=False, compare=False ) @@ -100,65 +159,87 @@ def __post_init__(self) -> None: ) aa, ab, bb = codecs_from_list(list(self.codecs)) + # SupportsSyncCodec was verified above; the cast is purely for mypy. + self._aa_codecs = cast("tuple[SupportsSyncCodec[NDBuffer, NDBuffer], ...]", tuple(aa)) + self._ab_codec = cast("SupportsSyncCodec[NDBuffer, Buffer]", ab) + self._bb_codecs = cast("tuple[SupportsSyncCodec[Buffer, Buffer], ...]", tuple(bb)) - aa_codecs: list[tuple[SupportsSyncCodec[NDBuffer, NDBuffer], ArraySpec]] = [] - spec = self.array_spec - for aa_codec in aa: - assert isinstance(aa_codec, SupportsSyncCodec) - aa_codecs.append((aa_codec, spec)) - spec = aa_codec.resolve_metadata(spec) - - self._aa_codecs = tuple(aa_codecs) - assert isinstance(ab, SupportsSyncCodec) - self._ab_codec = ab - self._ab_spec = spec - bb_sync: list[SupportsSyncCodec[Buffer, Buffer]] = [] - for bb_codec in bb: - assert isinstance(bb_codec, SupportsSyncCodec) - bb_sync.append(bb_codec) - self._bb_codecs = tuple(bb_sync) - - def decode( - self, - chunk_bytes: Buffer, - ) -> NDBuffer: + _cached_key: tuple[tuple[int, ...], int] | None = field( + init=False, repr=False, compare=False, default=None + ) + _cached_aa_specs: tuple[ArraySpec, ...] | None = field( + init=False, repr=False, compare=False, default=None + ) + _cached_ab_spec: ArraySpec | None = field(init=False, repr=False, compare=False, default=None) + + def _resolve_specs(self, chunk_spec: ArraySpec) -> tuple[tuple[ArraySpec, ...], ArraySpec]: + """Return per-AA-codec input specs and the AB spec for ``chunk_spec``. + + The codec chain only changes ``shape`` (via TransposeCodec etc.) — + ``prototype``, ``dtype``, ``fill_value``, and ``config`` are + invariant. 
We cache the resolved spec chain keyed on + ``(chunk_spec.shape, id(chunk_spec))``, and reuse it directly + when the same ``chunk_spec`` is passed again. For a different + ``chunk_spec`` with the same shape, we recompute (cheap). + """ + if not self._aa_codecs: + return (), chunk_spec + key = (chunk_spec.shape, id(chunk_spec)) + if self._cached_key == key: + assert self._cached_aa_specs is not None + assert self._cached_ab_spec is not None + return self._cached_aa_specs, self._cached_ab_spec + + aa_specs: list[ArraySpec] = [] + spec = chunk_spec + for aa_codec in self._aa_codecs: + aa_specs.append(spec) + spec = aa_codec.resolve_metadata(spec) # type: ignore[attr-defined] + aa_specs_t = tuple(aa_specs) + self._cached_key = key + self._cached_aa_specs = aa_specs_t + self._cached_ab_spec = spec + return aa_specs_t, spec + + def decode_chunk(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> NDBuffer: """Decode a single chunk through the full codec chain, synchronously. Pure compute -- no IO. """ + aa_specs, ab_spec = self._resolve_specs(chunk_spec) + data: Buffer = chunk_bytes for bb_codec in reversed(self._bb_codecs): - data = bb_codec._decode_sync(data, self._ab_spec) + data = bb_codec._decode_sync(data, ab_spec) - chunk_array: NDBuffer = self._ab_codec._decode_sync(data, self._ab_spec) + chunk_array: NDBuffer = self._ab_codec._decode_sync(data, ab_spec) - for aa_codec, spec in reversed(self._aa_codecs): - chunk_array = aa_codec._decode_sync(chunk_array, spec) + for aa_codec, aa_spec in zip(reversed(self._aa_codecs), reversed(aa_specs), strict=True): + chunk_array = aa_codec._decode_sync(chunk_array, aa_spec) return chunk_array - def encode( - self, - chunk_array: NDBuffer, - ) -> Buffer | None: + def encode_chunk(self, chunk_array: NDBuffer, chunk_spec: ArraySpec) -> Buffer | None: """Encode a single chunk through the full codec chain, synchronously. Pure compute -- no IO. """ + aa_specs, ab_spec = self._resolve_specs(chunk_spec) + aa_data: NDBuffer = chunk_array - for aa_codec, spec in self._aa_codecs: - aa_result = aa_codec._encode_sync(aa_data, spec) + for aa_codec, aa_spec in zip(self._aa_codecs, aa_specs, strict=True): + aa_result = aa_codec._encode_sync(aa_data, aa_spec) if aa_result is None: return None aa_data = aa_result - ab_result = self._ab_codec._encode_sync(aa_data, self._ab_spec) + ab_result = self._ab_codec._encode_sync(aa_data, ab_spec) if ab_result is None: return None bb_data: Buffer = ab_result for bb_codec in self._bb_codecs: - bb_result = bb_codec._encode_sync(bb_data, self._ab_spec) + bb_result = bb_codec._encode_sync(bb_data, ab_spec) if bb_result is None: return None bb_data = bb_result @@ -621,11 +702,13 @@ def codecs_from_list( ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]: from zarr.codecs.sharding import ShardingCodec + codecs = tuple(codecs) # materialize to avoid generator consumption issues + array_array: tuple[ArrayArrayCodec, ...] = () array_bytes_maybe: ArrayBytesCodec | None = None bytes_bytes: tuple[BytesBytesCodec, ...] 
= () - if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(tuple(codecs)) > 1: + if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs) > 1: warn( "Combining a `sharding_indexed` codec disables partial reads and " "writes, which may lead to inefficient performance.", @@ -679,3 +762,506 @@ def codecs_from_list( register_pipeline(BatchedCodecPipeline) + + +@dataclass(frozen=True) +class SyncCodecPipeline(CodecPipeline): + """Codec pipeline that uses the codec chain directly. + + Separates IO from compute without an intermediate layout abstraction. + The ShardingCodec handles shard IO internally via its ``_decode_sync`` + and ``_encode_sync`` methods, so the pipeline simply: + + 1. Fetches the raw blob from the store (one key per chunk/shard). + 2. Decodes/encodes through the codec chain (pure compute). + 3. Writes the result back. + + A ``ChunkTransform`` wraps the codec chain for fast synchronous + decode/encode when all codecs support ``SupportsSyncCodec``. + """ + + codecs: tuple[Codec, ...] + array_array_codecs: tuple[ArrayArrayCodec, ...] + array_bytes_codec: ArrayBytesCodec + bytes_bytes_codecs: tuple[BytesBytesCodec, ...] + _sync_transform: ChunkTransform | None + batch_size: int + + @classmethod + def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self: + codec_list = tuple(codecs) + aa, ab, bb = codecs_from_list(codec_list) + + if batch_size is None: + batch_size = config.get("codec_pipeline.batch_size") + + return cls( + codecs=codec_list, + array_array_codecs=aa, + array_bytes_codec=ab, + bytes_bytes_codecs=bb, + _sync_transform=None, + batch_size=batch_size, + ) + + def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self: + evolved_codecs = tuple(c.evolve_from_array_spec(array_spec=array_spec) for c in self.codecs) + aa, ab, bb = codecs_from_list(evolved_codecs) + + try: + sync_transform: ChunkTransform | None = ChunkTransform(codecs=evolved_codecs) + except TypeError: + sync_transform = None + + return type(self)( + codecs=evolved_codecs, + array_array_codecs=aa, + array_bytes_codec=ab, + bytes_bytes_codecs=bb, + _sync_transform=sync_transform, + batch_size=self.batch_size, + ) + + def __iter__(self) -> Iterator[Codec]: + return iter(self.codecs) + + @property + def supports_partial_decode(self) -> bool: + return isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin) + + @property + def supports_partial_encode(self) -> bool: + return isinstance(self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin) + + def validate( + self, + *, + shape: tuple[int, ...], + dtype: ZDType[TBaseDType, TBaseScalar], + chunk_grid: ChunkGridMetadata, + ) -> None: + for codec in self.codecs: + codec.validate(shape=shape, dtype=dtype, chunk_grid=chunk_grid) + + def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int: + for codec in self: + byte_length = codec.compute_encoded_size(byte_length, array_spec) + array_spec = codec.resolve_metadata(array_spec) + return byte_length + + # -- async decode/encode (required by ABC) -- + + async def decode( + self, + chunk_bytes_and_specs: Iterable[tuple[Buffer | None, ArraySpec]], + ) -> Iterable[NDBuffer | None]: + chunk_bytes_batch: Iterable[Buffer | None] + chunk_bytes_batch, chunk_specs = _unzip2(chunk_bytes_and_specs) + + for bb_codec in self.bytes_bytes_codecs[::-1]: + chunk_bytes_batch = await bb_codec.decode( + zip(chunk_bytes_batch, chunk_specs, strict=False) + ) + chunk_array_batch = await self.array_bytes_codec.decode( + 
zip(chunk_bytes_batch, chunk_specs, strict=False) + ) + for aa_codec in self.array_array_codecs[::-1]: + chunk_array_batch = await aa_codec.decode( + zip(chunk_array_batch, chunk_specs, strict=False) + ) + return chunk_array_batch + + async def encode( + self, + chunk_arrays_and_specs: Iterable[tuple[NDBuffer | None, ArraySpec]], + ) -> Iterable[Buffer | None]: + chunk_array_batch: Iterable[NDBuffer | None] + chunk_array_batch, chunk_specs = _unzip2(chunk_arrays_and_specs) + + for aa_codec in self.array_array_codecs: + chunk_array_batch = await aa_codec.encode( + zip(chunk_array_batch, chunk_specs, strict=False) + ) + chunk_bytes_batch = await self.array_bytes_codec.encode( + zip(chunk_array_batch, chunk_specs, strict=False) + ) + for bb_codec in self.bytes_bytes_codecs: + chunk_bytes_batch = await bb_codec.encode( + zip(chunk_bytes_batch, chunk_specs, strict=False) + ) + return chunk_bytes_batch + + # -- merge helper -- + + @staticmethod + def _merge_chunk_array( + existing_chunk_array: NDBuffer | None, + value: NDBuffer, + out_selection: SelectorTuple, + chunk_spec: ArraySpec, + chunk_selection: SelectorTuple, + is_complete_chunk: bool, + drop_axes: tuple[int, ...], + ) -> NDBuffer: + if ( + is_complete_chunk + and value.shape == chunk_spec.shape + and value[out_selection].shape == chunk_spec.shape + ): + return value + if existing_chunk_array is None: + chunk_array = chunk_spec.prototype.nd_buffer.create( + shape=chunk_spec.shape, + dtype=chunk_spec.dtype.to_native_dtype(), + order=chunk_spec.order, + fill_value=fill_value_or_default(chunk_spec), + ) + else: + chunk_array = existing_chunk_array.copy() + if chunk_selection == () or is_scalar( + value.as_ndarray_like(), chunk_spec.dtype.to_native_dtype() + ): + chunk_value = value + else: + chunk_value = value[out_selection] + if drop_axes: + item = tuple( + None if idx in drop_axes else slice(None) for idx in range(chunk_spec.ndim) + ) + chunk_value = chunk_value[item] + chunk_array[chunk_selection] = chunk_value + return chunk_array + + # -- sync read/write -- + + def read_sync( + self, + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + out: NDBuffer, + drop_axes: tuple[int, ...] = (), + max_workers: int = 1, + ) -> tuple[GetResult, ...]: + """Synchronous read: fetch -> decode -> scatter, per chunk. + + When ``max_workers > 1`` and there are multiple chunks, each + chunk's full lifecycle (fetch + decode + scatter) runs as one + task on a thread pool sized to ``max_workers`` — overlapping IO + of one chunk with decode/scatter of another. Scatter is + thread-safe because the chunks have non-overlapping output + selections. + + ``max_workers=1`` runs everything sequentially in the calling + thread (no pool involvement). + + Mirrors ``BatchedCodecPipeline.read_batch``: when the AB codec + supports partial decoding (e.g. sharding), the codec handles its + own IO and only fetches the inner-chunk byte ranges that overlap + the read selection. Otherwise the pipeline fetches the full + blob and decodes the whole chunk. + """ + assert self._sync_transform is not None + transform = self._sync_transform + + batch = list(batch_info) + if not batch: + return () + + fill = fill_value_or_default(batch[0][1]) + _missing = GetResult(status="missing") + + # Partial-decode fast path: the AB codec owns IO (read only the + # byte ranges needed for the requested selection). Same condition + # and dispatch as BatchedCodecPipeline.read_batch. 
+ if self.supports_partial_decode: + codec = self.array_bytes_codec + assert hasattr(codec, "_decode_partial_sync") + + def _read_one_partial( + item: tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool], + ) -> GetResult: + byte_getter, chunk_spec, chunk_selection, out_selection, _ = item + decoded = codec._decode_partial_sync(byte_getter, chunk_selection, chunk_spec) + if decoded is None: + out[out_selection] = fill + return _missing + if drop_axes: + decoded = decoded.squeeze(axis=drop_axes) + out[out_selection] = decoded + return GetResult(status="present") + + if max_workers > 1 and len(batch) > 1: + pool = _get_pool(max_workers) + return tuple(pool.map(_read_one_partial, batch)) + return tuple(_read_one_partial(item) for item in batch) + + # Per-chunk fused path: fetch + decode + scatter as one task. + def _read_one( + item: tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool], + ) -> GetResult: + byte_getter, chunk_spec, chunk_selection, out_selection, _ = item + raw = byte_getter.get_sync(prototype=chunk_spec.prototype) + if raw is None: + out[out_selection] = fill + return _missing + decoded = transform.decode_chunk(raw, chunk_spec) + selected = decoded[chunk_selection] + if drop_axes: + selected = selected.squeeze(axis=drop_axes) + out[out_selection] = selected + return GetResult(status="present") + + if max_workers > 1 and len(batch) > 1: + pool = _get_pool(max_workers) + return tuple(pool.map(_read_one, batch)) + return tuple(_read_one(item) for item in batch) + + def write_sync( + self, + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + value: NDBuffer, + drop_axes: tuple[int, ...] = (), + max_workers: int = 1, + ) -> None: + """Synchronous write: fetch existing -> merge+encode -> store. + + When ``max_workers > 1`` and there are multiple chunks, each + chunk's full lifecycle (get-existing + merge + encode + set/delete) + runs as one task on a thread pool sized to ``max_workers`` — + overlapping IO of one chunk with compute of another. + + ``max_workers=1`` runs everything sequentially in the calling + thread (no pool involvement). + + When the codec pipeline supports partial encoding (e.g. a + sharding codec with no outer AA/BB codecs), the AB codec handles + the full write cycle — reading existing data, merging, encoding, + and writing — matching the async ``BatchedCodecPipeline`` path. + """ + assert self._sync_transform is not None + transform = self._sync_transform + + batch = list(batch_info) + if not batch: + return + + # Partial-encode path: the AB codec owns IO (read, merge, encode, + # write). Same condition and calling convention as + # BatchedCodecPipeline.write_batch. + if self.supports_partial_encode: + codec = self.array_bytes_codec + assert hasattr(codec, "_encode_partial_sync") + scalar = len(value.shape) == 0 + + def _encode_one_partial( + item: tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool], + ) -> None: + bs, chunk_spec, chunk_selection, out_selection, _is_complete = item + chunk_value = value if scalar else value[out_selection] + codec._encode_partial_sync(bs, chunk_value, chunk_selection, chunk_spec) + + if max_workers > 1 and len(batch) > 1: + pool = _get_pool(max_workers) + # consume the iterator to surface exceptions + list(pool.map(_encode_one_partial, batch)) + else: + for item in batch: + _encode_one_partial(item) + return + + # Per-chunk fused path: get-existing + merge + encode + set/delete as one task. 
+ def _write_one( + item: tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool], + ) -> None: + bs, chunk_spec, chunk_selection, out_selection, is_complete = item + existing_bytes: Buffer | None = None + if not is_complete: + existing_bytes = bs.get_sync(prototype=chunk_spec.prototype) + + existing_chunk_array: NDBuffer | None = None + if existing_bytes is not None: + existing_chunk_array = transform.decode_chunk(existing_bytes, chunk_spec) + + chunk_array = self._merge_chunk_array( + existing_chunk_array, + value, + out_selection, + chunk_spec, + chunk_selection, + is_complete, + drop_axes, + ) + + if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal( + fill_value_or_default(chunk_spec) + ): + bs.delete_sync() + return + + encoded = transform.encode_chunk(chunk_array, chunk_spec) + if encoded is None: + bs.delete_sync() + else: + bs.set_sync(encoded) + + if max_workers > 1 and len(batch) > 1: + pool = _get_pool(max_workers) + list(pool.map(_write_one, batch)) + else: + for item in batch: + _write_one(item) + + # -- async read/write -- + + async def read( + self, + batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + out: NDBuffer, + drop_axes: tuple[int, ...] = (), + ) -> tuple[GetResult, ...]: + batch = list(batch_info) + if not batch: + return () + + # Fast path: sync store with sync transform + from zarr.abc.store import SupportsGetSync + from zarr.storage._common import StorePath + + first_bg = batch[0][0] + if ( + self._sync_transform is not None + and isinstance(first_bg, StorePath) + and isinstance(first_bg.store, SupportsGetSync) + ): + return self.read_sync(batch, out, drop_axes, max_workers=_resolve_max_workers()) + + # Async fallback: fetch all chunks, decode via async codec API, scatter + chunk_bytes_batch = await concurrent_map( + [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch], + lambda byte_getter, prototype: byte_getter.get(prototype), + config.get("async.concurrency"), + ) + chunk_array_batch = await self.decode( + [ + (chunk_bytes, chunk_spec) + for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch, strict=False) + ], + ) + results: list[GetResult] = [] + for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip( + chunk_array_batch, batch, strict=False + ): + if chunk_array is not None: + tmp = chunk_array[chunk_selection] + if drop_axes: + tmp = tmp.squeeze(axis=drop_axes) + out[out_selection] = tmp + results.append(GetResult(status="present")) + else: + out[out_selection] = fill_value_or_default(chunk_spec) + results.append(GetResult(status="missing")) + return tuple(results) + + async def write( + self, + batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + value: NDBuffer, + drop_axes: tuple[int, ...] 
= (), + ) -> None: + batch = list(batch_info) + if not batch: + return + + # Fast path: sync store with sync transform + from zarr.abc.store import SupportsSetSync + from zarr.storage._common import StorePath + + first_bs = batch[0][0] + if ( + self._sync_transform is not None + and isinstance(first_bs, StorePath) + and isinstance(first_bs.store, SupportsSetSync) + ): + self.write_sync(batch, value, drop_axes, max_workers=_resolve_max_workers()) + return + + # Async fallback: same pattern as BatchedCodecPipeline.write_batch + async def _read_key( + byte_setter: ByteSetter | None, prototype: BufferPrototype + ) -> Buffer | None: + if byte_setter is None: + return None + return await byte_setter.get(prototype=prototype) + + chunk_bytes_batch: Iterable[Buffer | None] + chunk_bytes_batch = await concurrent_map( + [ + ( + None if is_complete_chunk else byte_setter, + chunk_spec.prototype, + ) + for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch + ], + _read_key, + config.get("async.concurrency"), + ) + chunk_array_decoded = await self.decode( + [ + (chunk_bytes, chunk_spec) + for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch, strict=False) + ], + ) + + chunk_array_merged = [ + self._merge_chunk_array( + chunk_array, + value, + out_selection, + chunk_spec, + chunk_selection, + is_complete_chunk, + drop_axes, + ) + for chunk_array, ( + _, + chunk_spec, + chunk_selection, + out_selection, + is_complete_chunk, + ) in zip(chunk_array_decoded, batch, strict=False) + ] + chunk_array_batch: list[NDBuffer | None] = [] + for chunk_array, (_, chunk_spec, *_) in zip(chunk_array_merged, batch, strict=False): + if chunk_array is None: + chunk_array_batch.append(None) # type: ignore[unreachable] + else: + if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal( + fill_value_or_default(chunk_spec) + ): + chunk_array_batch.append(None) + else: + chunk_array_batch.append(chunk_array) + + chunk_bytes_batch = await self.encode( + [ + (chunk_array, chunk_spec) + for chunk_array, (_, chunk_spec, *_) in zip(chunk_array_batch, batch, strict=False) + ], + ) + + async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> None: + if chunk_bytes is None: + await byte_setter.delete() + else: + await byte_setter.set(chunk_bytes) + + await concurrent_map( + [ + (byte_setter, chunk_bytes) + for chunk_bytes, (byte_setter, *_) in zip(chunk_bytes_batch, batch, strict=False) + ], + _write_key, + config.get("async.concurrency"), + ) + + +register_pipeline(SyncCodecPipeline) diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py index 7dcbc78e31..1d7060b7fb 100644 --- a/src/zarr/core/config.py +++ b/src/zarr/core/config.py @@ -106,6 +106,7 @@ def enable_gpu(self) -> ConfigSet: "codec_pipeline": { "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", "batch_size": 1, + "max_workers": None, }, "codecs": { "blosc": "zarr.codecs.blosc.BloscCodec", diff --git a/tests/test_codec_pipeline.py b/tests/test_codec_pipeline.py index 48e15b0643..015a98c495 100644 --- a/tests/test_codec_pipeline.py +++ b/tests/test_codec_pipeline.py @@ -1,33 +1,63 @@ from __future__ import annotations +from typing import TYPE_CHECKING, Any + +import numpy as np import pytest import zarr from zarr.core.array import _get_chunk_spec from zarr.core.buffer.core import default_buffer_prototype +from zarr.core.config import config as zarr_config from zarr.core.indexing import BasicIndexer +from zarr.errors import ChunkNotFoundError from zarr.storage import MemoryStore +if 
TYPE_CHECKING: + from collections.abc import Generator + + +@pytest.fixture(autouse=True) +def _enable_rectilinear_chunks() -> Generator[None]: + """Enable rectilinear chunks for all tests in this module.""" + with zarr_config.set({"array.rectilinear_chunks": True}): + yield + + +pipeline_paths = [ + "zarr.core.codec_pipeline.BatchedCodecPipeline", + "zarr.core.codec_pipeline.SyncCodecPipeline", +] + + +@pytest.fixture(params=pipeline_paths, ids=["batched", "sync"]) +def pipeline_class(request: pytest.FixtureRequest) -> Generator[str]: + """Temporarily set the codec pipeline class for the test.""" + path = request.param + with zarr_config.set({"codec_pipeline.path": path}): + yield path + + +# --------------------------------------------------------------------------- +# GetResult status tests (low-level pipeline API) +# --------------------------------------------------------------------------- + @pytest.mark.parametrize( ("write_slice", "read_slice", "expected_statuses"), [ - # Write all chunks, read all — all present (slice(None), slice(None), ("present", "present", "present")), - # Write first chunk only, read all — first present, rest missing (slice(0, 2), slice(None), ("present", "missing", "missing")), - # Write nothing, read all — all missing (None, slice(None), ("missing", "missing", "missing")), ], ) async def test_read_returns_get_results( + pipeline_class: str, write_slice: slice | None, read_slice: slice, expected_statuses: tuple[str, ...], ) -> None: - """ - Test that CodecPipeline.read returns a tuple of GetResult with correct statuses. - """ + """CodecPipeline.read returns GetResult with correct statuses.""" store = MemoryStore() arr = zarr.open_array(store, mode="w", shape=(6,), chunks=(2,), dtype="int64", fill_value=-1) @@ -70,3 +100,294 @@ async def test_read_returns_get_results( assert len(results) == len(expected_statuses) for result, expected_status in zip(results, expected_statuses, strict=True): assert result["status"] == expected_status + + +# --------------------------------------------------------------------------- +# End-to-end read/write tests +# --------------------------------------------------------------------------- + +array_configs = [ + pytest.param( + {"shape": (100,), "dtype": "float64", "chunks": (10,), "shards": None, "compressors": None}, + id="1d-unsharded", + ), + pytest.param( + { + "shape": (100,), + "dtype": "float64", + "chunks": (10,), + "shards": (100,), + "compressors": None, + }, + id="1d-sharded", + ), + pytest.param( + { + "shape": (10, 20), + "dtype": "int32", + "chunks": (5, 10), + "shards": None, + "compressors": None, + }, + id="2d-unsharded", + ), + pytest.param( + { + "shape": (100,), + "dtype": "float64", + "chunks": (10,), + "shards": None, + "compressors": {"name": "gzip", "configuration": {"level": 1}}, + }, + id="1d-gzip", + ), + pytest.param( + { + "shape": (60, 100), + "dtype": "int32", + "chunks": [[10, 20, 30], [50, 50]], + "shards": None, + "compressors": None, + }, + id="2d-rectilinear", + ), +] + + +@pytest.mark.parametrize("arr_kwargs", array_configs) +async def test_roundtrip(pipeline_class: str, arr_kwargs: dict[str, Any]) -> None: + """Data survives a full write/read roundtrip.""" + store = MemoryStore() + arr = zarr.create_array(store=store, fill_value=0, **arr_kwargs) + data = np.arange(int(np.prod(arr.shape)), dtype=arr.dtype).reshape(arr.shape) + arr[:] = data + np.testing.assert_array_equal(arr[:], data) + + +@pytest.mark.parametrize("arr_kwargs", array_configs) +async def 
test_missing_chunks_fill_value(pipeline_class: str, arr_kwargs: dict[str, Any]) -> None: + """Reading unwritten chunks returns the fill value.""" + store = MemoryStore() + fill = -1 + arr = zarr.create_array(store=store, fill_value=fill, **arr_kwargs) + expected = np.full(arr.shape, fill, dtype=arr.dtype) + np.testing.assert_array_equal(arr[:], expected) + + +write_then_read_cases = [ + pytest.param( + slice(None), + np.s_[:], + id="full-write-full-read", + ), + pytest.param( + slice(5, 15), + np.s_[:], + id="partial-write-full-read", + ), + pytest.param( + slice(None), + np.s_[::3], + id="full-write-strided-read", + ), + pytest.param( + slice(None), + np.s_[10:20], + id="full-write-slice-read", + ), +] + + +@pytest.mark.parametrize( + "arr_kwargs", + [ + pytest.param( + { + "shape": (100,), + "dtype": "float64", + "chunks": (10,), + "shards": None, + "compressors": None, + }, + id="unsharded", + ), + pytest.param( + { + "shape": (100,), + "dtype": "float64", + "chunks": (10,), + "shards": (100,), + "compressors": None, + }, + id="sharded", + ), + ], +) +@pytest.mark.parametrize(("write_sel", "read_sel"), write_then_read_cases) +async def test_write_then_read( + pipeline_class: str, + arr_kwargs: dict[str, Any], + write_sel: slice, + read_sel: slice, +) -> None: + """Various write + read selection combinations produce correct results.""" + store = MemoryStore() + arr = zarr.create_array(store=store, fill_value=0.0, **arr_kwargs) + full = np.zeros(arr.shape, dtype=arr.dtype) + + write_data = np.arange(len(full[write_sel]), dtype=arr.dtype) + 1 + full[write_sel] = write_data + arr[write_sel] = write_data + + np.testing.assert_array_equal(arr[read_sel], full[read_sel]) + + +# --------------------------------------------------------------------------- +# write_empty_chunks / read_missing_chunks config tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "arr_kwargs", + [ + pytest.param( + { + "shape": (20,), + "dtype": "float64", + "chunks": (10,), + "shards": None, + "compressors": None, + }, + id="unsharded", + ), + pytest.param( + { + "shape": (20,), + "dtype": "float64", + "chunks": (10,), + "shards": (20,), + "compressors": None, + }, + id="sharded", + ), + ], +) +async def test_write_empty_chunks_false(pipeline_class: str, arr_kwargs: dict[str, Any]) -> None: + """With write_empty_chunks=False, writing fill_value should not persist the chunk.""" + store = MemoryStore() + arr = zarr.create_array( + store=store, + fill_value=0.0, + config={"write_empty_chunks": False}, + **arr_kwargs, + ) + # Write non-fill to first chunk, fill_value to second chunk + arr[0:10] = np.arange(10, dtype="float64") + 1 + arr[10:20] = np.zeros(10, dtype="float64") # all fill_value + + # Read back — both chunks should return correct data + np.testing.assert_array_equal(arr[0:10], np.arange(10, dtype="float64") + 1) + np.testing.assert_array_equal(arr[10:20], np.zeros(10, dtype="float64")) + + +async def test_write_empty_chunks_true(pipeline_class: str) -> None: + """With write_empty_chunks=True, fill_value chunks should still be stored.""" + store: dict[str, Any] = {} + arr = zarr.create_array( + store=store, + shape=(20,), + dtype="float64", + chunks=(10,), + shards=None, + compressors=None, + fill_value=0.0, + config={"write_empty_chunks": True}, + ) + arr[:] = 0.0 # all fill_value + + # With write_empty_chunks=True, chunks should be persisted even though + # they equal the fill value. 
+ assert "c/0" in store + assert "c/1" in store + + +async def test_write_empty_chunks_false_no_store(pipeline_class: str) -> None: + """With write_empty_chunks=False, fill_value-only chunks should not be stored.""" + store: dict[str, Any] = {} + arr = zarr.create_array( + store=store, + shape=(20,), + dtype="float64", + chunks=(10,), + shards=None, + compressors=None, + fill_value=0.0, + config={"write_empty_chunks": False}, + ) + arr[:] = 0.0 # all fill_value + + # Chunks should NOT be persisted + assert "c/0" not in store + assert "c/1" not in store + + # But reading should still return fill values + np.testing.assert_array_equal(arr[:], np.zeros(20, dtype="float64")) + + +async def test_read_missing_chunks_false_raises(pipeline_class: str) -> None: + """With read_missing_chunks=False, reading a missing chunk should raise.""" + store = MemoryStore() + arr = zarr.create_array( + store=store, + shape=(20,), + dtype="float64", + chunks=(10,), + shards=None, + compressors=None, + fill_value=0.0, + config={"read_missing_chunks": False}, + ) + # Don't write anything — all chunks are missing + with pytest.raises(ChunkNotFoundError): + arr[:] + + +async def test_read_missing_chunks_true_fills(pipeline_class: str) -> None: + """With read_missing_chunks=True (default), missing chunks return fill_value.""" + store = MemoryStore() + arr = zarr.create_array( + store=store, + shape=(20,), + dtype="float64", + chunks=(10,), + shards=None, + compressors=None, + fill_value=-999.0, + ) + # Don't write anything + np.testing.assert_array_equal(arr[:], np.full(20, -999.0)) + + +async def test_nested_sharding_roundtrip(pipeline_class: str) -> None: + """Nested sharding: data survives write/read roundtrip.""" + from zarr.codecs.bytes import BytesCodec + from zarr.codecs.sharding import ShardingCodec + + inner_sharding = ShardingCodec(chunk_shape=(10,), codecs=[BytesCodec()]) + outer_sharding = ShardingCodec(chunk_shape=(50,), codecs=[inner_sharding]) + + store = MemoryStore() + arr = zarr.create_array( + store=store, + shape=(100,), + dtype="uint8", + chunks=(100,), + compressors=None, + fill_value=0, + serializer=outer_sharding, + ) + data = np.arange(100, dtype="uint8") + arr[:] = data + np.testing.assert_array_equal(arr[:], data) + # Partial read + np.testing.assert_array_equal(arr[40:60], data[40:60]) diff --git a/tests/test_config.py b/tests/test_config.py index 4e293e968f..9ae133a4a4 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -63,6 +63,7 @@ def test_config_defaults_set() -> None: "codec_pipeline": { "path": "zarr.core.codec_pipeline.BatchedCodecPipeline", "batch_size": 1, + "max_workers": None, }, "codecs": { "blosc": "zarr.codecs.blosc.BloscCodec", @@ -134,7 +135,7 @@ def test_config_codec_pipeline_class(store: Store) -> None: # has default value assert get_pipeline_class().__name__ != "" - config.set({"codec_pipeline.name": "zarr.core.codec_pipeline.BatchedCodecPipeline"}) + config.set({"codec_pipeline.path": "zarr.core.codec_pipeline.BatchedCodecPipeline"}) assert get_pipeline_class() == zarr.core.codec_pipeline.BatchedCodecPipeline _mock = Mock() diff --git a/tests/test_sync_codec_pipeline.py b/tests/test_sync_codec_pipeline.py index 1bfde7c837..f161dd39da 100644 --- a/tests/test_sync_codec_pipeline.py +++ b/tests/test_sync_codec_pipeline.py @@ -58,8 +58,8 @@ def _make_nd_buffer(arr: np.ndarray[Any, np.dtype[Any]]) -> NDBuffer: ) def test_construction(shape: tuple[int, ...], codecs: tuple[Codec, ...]) -> None: """Construction succeeds when all codecs implement 
SupportsSyncCodec.""" - spec = _make_array_spec(shape, np.dtype("float64")) - ChunkTransform(codecs=codecs, array_spec=spec) + _ = _make_array_spec(shape, np.dtype("float64")) + ChunkTransform(codecs=codecs) @pytest.mark.parametrize( @@ -72,9 +72,9 @@ def test_construction(shape: tuple[int, ...], codecs: tuple[Codec, ...]) -> None ) def test_construction_rejects_non_sync(shape: tuple[int, ...], codecs: tuple[Codec, ...]) -> None: """Construction raises TypeError when any codec lacks SupportsSyncCodec.""" - spec = _make_array_spec(shape, np.dtype("float64")) + _ = _make_array_spec(shape, np.dtype("float64")) with pytest.raises(TypeError, match="AsyncOnlyCodec"): - ChunkTransform(codecs=codecs, array_spec=spec) + ChunkTransform(codecs=codecs) @pytest.mark.parametrize( @@ -96,12 +96,12 @@ def test_encode_decode_roundtrip( ) -> None: """Data survives a full encode/decode cycle.""" spec = _make_array_spec(arr.shape, arr.dtype) - chain = ChunkTransform(codecs=codecs, array_spec=spec) + chain = ChunkTransform(codecs=codecs) nd_buf = _make_nd_buffer(arr) - encoded = chain.encode(nd_buf) + encoded = chain.encode_chunk(nd_buf, spec) assert encoded is not None - decoded = chain.decode(encoded) + decoded = chain.decode_chunk(encoded, spec) np.testing.assert_array_equal(arr, decoded.as_numpy_array()) @@ -122,7 +122,7 @@ def test_compute_encoded_size( ) -> None: """compute_encoded_size returns the correct byte length.""" spec = _make_array_spec(shape, np.dtype("float64")) - chain = ChunkTransform(codecs=codecs, array_spec=spec) + chain = ChunkTransform(codecs=codecs) assert chain.compute_encoded_size(input_size, spec) == expected_size @@ -138,8 +138,7 @@ def _encode_sync(self, chunk_array: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer spec = _make_array_spec((3, 4), np.dtype("float64")) chain = ChunkTransform( codecs=(NoneReturningAACodec(order=(1, 0)), BytesCodec()), - array_spec=spec, ) arr = np.arange(12, dtype="float64").reshape(3, 4) nd_buf = _make_nd_buffer(arr) - assert chain.encode(nd_buf) is None + assert chain.encode_chunk(nd_buf, spec) is None From a48f4f7edbfd33f2e0bcd7c7382df1761b44d5b3 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 17 Apr 2026 22:53:47 +0200 Subject: [PATCH 09/22] feat: partial-shard write support in ShardingCodec Adds _encode_partial_sync and _decode_partial_sync to ShardingCodec. For fixed-size inner codec chains and stores that implement SupportsSetRange, partial writes patch individual inner-chunk slots in-place instead of rewriting the whole shard: - Reads existing shard index (one byte-range get). - For each affected inner chunk: decodes the slot, merges the new region, re-encodes. - Writes each modified slot at its deterministic byte offset, then rewrites just the index. For variable-size inner codecs (e.g. with compression) or stores that don't support byte-range writes, falls through to a full-shard rewrite matching BatchedCodecPipeline semantics. The partial-decode path computes a ReadPlan from the shard index and issues one byte-range get per overlapping chunk, decoding only what the read selection touches. Both paths are dispatched from SyncCodecPipeline via the existing supports_partial_decode / supports_partial_encode protocol checks. 
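As a rough sketch of the byte-offset arithmetic behind the in-place slot
patching (hypothetical helper and variable names, not the actual zarr API;
slot ordering and index placement depend on the codec configuration):

    def slot_offset(
        slot_rank: int,
        chunk_nbytes: int,
        index_nbytes: int,
        index_at_start: bool,
    ) -> int:
        """Byte offset of fixed-size inner-chunk slot ``slot_rank`` in a shard.

        Assumes every inner chunk occupies a fixed-size data slot, slots are
        laid out contiguously in a known order, and the shard index sits
        either before or after the data slots.
        """
        data_start = index_nbytes if index_at_start else 0
        return data_start + slot_rank * chunk_nbytes

    # A partial write then re-encodes only the affected inner chunks and
    # patches them with byte-range writes (pseudocode; the store must
    # implement SupportsSetRange):
    #   encoded = inner_transform.encode_chunk(merged_inner_chunk, chunk_spec)
    #   store.set_range_sync(shard_key, encoded, start=slot_offset(...))
    #   store.set_range_sync(shard_key, new_index_bytes, start=index_offset)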
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/zarr/codecs/sharding.py | 465 +++++++++++++++++++++++++++- tests/test_sync_pipeline.py | 593 ++++++++++++++++++++++++++++++++++++ 2 files changed, 1057 insertions(+), 1 deletion(-) create mode 100644 tests/test_sync_pipeline.py diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 609e32f87d..a64ce2bdab 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -307,6 +307,8 @@ class ShardingCodec( ): """Sharding codec""" + is_fixed_size = False + chunk_shape: tuple[int, ...] codecs: tuple[Codec, ...] index_codecs: tuple[Codec, ...] @@ -338,6 +340,12 @@ def __init__( # object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec)) object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) + object.__setattr__( + self, "_get_inner_chunk_transform", lru_cache()(self._get_inner_chunk_transform) + ) + object.__setattr__( + self, "_get_index_chunk_transform", lru_cache()(self._get_index_chunk_transform) + ) # todo: typedict return type def __getstate__(self) -> dict[str, Any]: @@ -354,6 +362,12 @@ def __setstate__(self, state: dict[str, Any]) -> None: # object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec)) object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec)) object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard)) + object.__setattr__( + self, "_get_inner_chunk_transform", lru_cache()(self._get_inner_chunk_transform) + ) + object.__setattr__( + self, "_get_index_chunk_transform", lru_cache()(self._get_index_chunk_transform) + ) @classmethod def from_dict(cls, data: dict[str, JSON]) -> Self: @@ -362,7 +376,9 @@ def from_dict(cls, data: dict[str, JSON]) -> Self: @property def codec_pipeline(self) -> CodecPipeline: - return get_pipeline_class().from_codecs(self.codecs) + from zarr.core.codec_pipeline import BatchedCodecPipeline + + return BatchedCodecPipeline.from_codecs(self.codecs) def to_dict(self) -> dict[str, JSON]: return { @@ -412,6 +428,340 @@ def validate( f"divisible by the shard's inner chunk size {inner}." ) + def _get_inner_chunk_transform(self, shard_spec: ArraySpec) -> Any: + """Build a ChunkTransform for the inner codec chain. + + The cache key is the shard_spec because evolved codecs may + depend on it. The runtime chunk_spec is supplied per call. + """ + from zarr.core.codec_pipeline import ChunkTransform + + chunk_spec = self._get_chunk_spec(shard_spec) + evolved = tuple(c.evolve_from_array_spec(array_spec=chunk_spec) for c in self.codecs) + return ChunkTransform(codecs=evolved) + + def _get_index_chunk_transform(self, chunks_per_shard: tuple[int, ...]) -> Any: + """Build a ChunkTransform for the index codec chain.""" + from zarr.core.codec_pipeline import ChunkTransform + + index_spec = self._get_index_chunk_spec(chunks_per_shard) + evolved = tuple(c.evolve_from_array_spec(array_spec=index_spec) for c in self.index_codecs) + return ChunkTransform(codecs=evolved) + + def _decode_shard_index_sync( + self, index_bytes: Buffer, chunks_per_shard: tuple[int, ...] 
+ ) -> _ShardIndex: + """Decode shard index synchronously using ChunkTransform.""" + index_transform = self._get_index_chunk_transform(chunks_per_shard) + index_spec = self._get_index_chunk_spec(chunks_per_shard) + index_array = index_transform.decode_chunk(index_bytes, index_spec) + return _ShardIndex(index_array.as_numpy_array()) + + def _encode_shard_index_sync(self, index: _ShardIndex) -> Buffer: + """Encode shard index synchronously using ChunkTransform.""" + index_transform = self._get_index_chunk_transform(index.chunks_per_shard) + index_spec = self._get_index_chunk_spec(index.chunks_per_shard) + index_nd = get_ndbuffer_class().from_numpy_array(index.offsets_and_lengths) + result: Buffer | None = index_transform.encode_chunk(index_nd, index_spec) + assert result is not None + return result + + def _shard_reader_from_bytes_sync( + self, buf: Buffer, chunks_per_shard: tuple[int, ...] + ) -> _ShardReader: + """Sync version of _ShardReader.from_bytes.""" + shard_index_size = self._shard_index_size(chunks_per_shard) + if self.index_location == ShardingCodecIndexLocation.start: + shard_index_bytes = buf[:shard_index_size] + else: + shard_index_bytes = buf[-shard_index_size:] + index = self._decode_shard_index_sync(shard_index_bytes, chunks_per_shard) + reader = _ShardReader() + reader.buf = buf + reader.index = index + return reader + + def _decode_sync( + self, + shard_bytes: Buffer, + shard_spec: ArraySpec, + ) -> NDBuffer: + """Decode a full shard synchronously.""" + shard_shape = shard_spec.shape + chunk_shape = self.chunk_shape + chunks_per_shard = self._get_chunks_per_shard(shard_spec) + chunk_spec = self._get_chunk_spec(shard_spec) + inner_transform = self._get_inner_chunk_transform(shard_spec) + + indexer = BasicIndexer( + tuple(slice(0, s) for s in shard_shape), + shape=shard_shape, + chunk_grid=ChunkGrid.from_sizes(shard_shape, chunk_shape), + ) + + out = chunk_spec.prototype.nd_buffer.empty( + shape=shard_shape, + dtype=shard_spec.dtype.to_native_dtype(), + order=shard_spec.order, + ) + + shard_dict = self._shard_reader_from_bytes_sync(shard_bytes, chunks_per_shard) + + if shard_dict.index.is_all_empty(): + out.fill(shard_spec.fill_value) + return out + + for chunk_coords, chunk_selection, out_selection, _ in indexer: + try: + chunk_bytes = shard_dict[chunk_coords] + except KeyError: + out[out_selection] = shard_spec.fill_value + continue + chunk_array = inner_transform.decode_chunk(chunk_bytes, chunk_spec) + out[out_selection] = chunk_array[chunk_selection] + + return out + + def _encode_sync( + self, + shard_array: NDBuffer, + shard_spec: ArraySpec, + ) -> Buffer | None: + """Encode a full shard synchronously.""" + shard_shape = shard_spec.shape + chunks_per_shard = self._get_chunks_per_shard(shard_spec) + chunk_spec = self._get_chunk_spec(shard_spec) + inner_transform = self._get_inner_chunk_transform(shard_spec) + + indexer = BasicIndexer( + tuple(slice(0, s) for s in shard_shape), + shape=shard_shape, + chunk_grid=ChunkGrid.from_sizes(shard_shape, self.chunk_shape), + ) + + shard_builder: dict[tuple[int, ...], Buffer | None] = dict.fromkeys( + morton_order_iter(chunks_per_shard) + ) + + skip_empty = not shard_spec.config.write_empty_chunks + fill_value = shard_spec.fill_value + if fill_value is None: + fill_value = shard_spec.dtype.default_scalar() + + for chunk_coords, _chunk_selection, out_selection, _ in indexer: + chunk_array = shard_array[out_selection] + if skip_empty and chunk_array.all_equal(fill_value): + shard_builder[chunk_coords] = None + else: + encoded = 
inner_transform.encode_chunk(chunk_array, chunk_spec) + shard_builder[chunk_coords] = encoded + + return self._encode_shard_dict_sync( + shard_builder, + chunks_per_shard=chunks_per_shard, + buffer_prototype=default_buffer_prototype(), + ) + + def _encode_partial_sync( + self, + byte_setter: Any, + value: NDBuffer, + selection: SelectorTuple, + shard_spec: ArraySpec, + ) -> None: + """Sync equivalent of ``_encode_partial_single``. + + Receives the source data for the written region (not a pre-merged + shard array) and the selection within the shard, matching the + calling convention of the async partial-encode path used by + ``BatchedCodecPipeline``. + + When inner codecs are fixed-size and the store supports + ``set_range_sync``, partial writes update only the affected inner + chunks at their deterministic byte offsets. Otherwise falls back + to a full shard rewrite. + """ + from zarr.abc.store import SupportsSetRange + + shard_shape = shard_spec.shape + chunks_per_shard = self._get_chunks_per_shard(shard_spec) + chunk_spec = self._get_chunk_spec(shard_spec) + inner_transform = self._get_inner_chunk_transform(shard_spec) + + indexer = list( + get_indexer( + selection, + shape=shard_shape, + chunk_grid=ChunkGrid.from_sizes(shard_shape, self.chunk_shape), + ) + ) + + is_complete = self._is_complete_shard_write(indexer, chunks_per_shard) + + skip_empty = not shard_spec.config.write_empty_chunks + fill_value = shard_spec.fill_value + if fill_value is None: + fill_value = shard_spec.dtype.default_scalar() + + is_scalar = len(value.shape) == 0 + + # --- Byte-range fast path --- + # Only safe when we don't need to skip empty chunks: byte-range + # writes leave chunk presence unchanged (writes a fixed-size + # data slot for every affected chunk). Compacting empty chunks + # away requires rewriting the whole shard. + store = byte_setter.store if hasattr(byte_setter, "store") else None + if ( + not is_complete + and not skip_empty + and self._inner_codecs_fixed_size + and isinstance(store, SupportsSetRange) + ): + chunk_byte_length = self._inner_chunk_byte_length(chunk_spec) + n_chunks = product(chunks_per_shard) + shard_index_size = self._shard_index_size(chunks_per_shard) + total_data_size = n_chunks * chunk_byte_length + total_shard_size = total_data_size + shard_index_size + + existing = byte_setter.get_sync(prototype=shard_spec.prototype) + if existing is not None and len(existing) == total_shard_size: + key = byte_setter.path if hasattr(byte_setter, "path") else str(byte_setter) + shard_reader = self._shard_reader_from_bytes_sync(existing, chunks_per_shard) + # The decoded index may be a view of a read-only buffer (e.g. + # mmap-backed reads from LocalStore). Copy so set_chunk_slice + # below can mutate it. 
+ index = _ShardIndex(shard_reader.index.offsets_and_lengths.copy()) + + rank_map = {c: r for r, c in enumerate(morton_order_iter(chunks_per_shard))} + + def _byte_offset(coords: tuple[int, ...]) -> int: + offset = rank_map[coords] * chunk_byte_length + if self.index_location == ShardingCodecIndexLocation.start: + offset += shard_index_size + return offset + + for chunk_coords, chunk_sel, out_sel, is_complete_chunk in indexer: + byte_offset = _byte_offset(chunk_coords) + chunk_value = value if is_scalar else value[out_sel] + + if is_complete_chunk and not is_scalar: + chunk_array = chunk_value + else: + # Decode existing inner chunk, then merge new data + existing_chunk_bytes = existing[ + byte_offset : byte_offset + chunk_byte_length + ] + chunk_array = inner_transform.decode_chunk( + existing_chunk_bytes, chunk_spec + ).copy() + chunk_array[chunk_sel] = chunk_value + + encoded = inner_transform.encode_chunk(chunk_array, chunk_spec) + if encoded is not None: + store.set_range_sync(key, encoded, byte_offset) + index.set_chunk_slice( + chunk_coords, + slice(byte_offset, byte_offset + chunk_byte_length), + ) + + index_bytes = self._encode_shard_index_sync(index) + if self.index_location == ShardingCodecIndexLocation.start: + store.set_range_sync(key, index_bytes, 0) + else: + store.set_range_sync(key, index_bytes, total_data_size) + return + + # --- Full shard rewrite path --- + # Load existing inner-chunk bytes into a dict (same structure as + # the async path's shard_dict). + if is_complete: + shard_dict: dict[tuple[int, ...], Buffer | None] = dict.fromkeys( + morton_order_iter(chunks_per_shard) + ) + else: + existing_bytes = byte_setter.get_sync(prototype=shard_spec.prototype) + if existing_bytes is not None: + shard_reader_fb = self._shard_reader_from_bytes_sync( + existing_bytes, chunks_per_shard + ) + shard_dict = {} + for coords in morton_order_iter(chunks_per_shard): + try: + shard_dict[coords] = shard_reader_fb[coords] + except KeyError: + shard_dict[coords] = None + else: + shard_dict = dict.fromkeys(morton_order_iter(chunks_per_shard)) + + # Merge, encode, and store each affected inner chunk into shard_dict. 
+ for chunk_coords, chunk_sel, out_sel, is_complete_chunk in indexer: + chunk_value = value if is_scalar else value[out_sel] + + if is_complete_chunk and not is_scalar: + chunk_array = chunk_value + else: + existing_raw = shard_dict.get(chunk_coords) + if existing_raw is not None: + chunk_array = inner_transform.decode_chunk(existing_raw, chunk_spec).copy() + else: + chunk_array = chunk_spec.prototype.nd_buffer.create( + shape=self.chunk_shape, + dtype=shard_spec.dtype.to_native_dtype(), + order=shard_spec.order, + fill_value=fill_value, + ) + chunk_array[chunk_sel] = chunk_value + + if skip_empty and chunk_array.all_equal(fill_value): + shard_dict[chunk_coords] = None + else: + shard_dict[chunk_coords] = inner_transform.encode_chunk(chunk_array, chunk_spec) + + blob = self._encode_shard_dict_sync( + shard_dict, + chunks_per_shard=chunks_per_shard, + buffer_prototype=default_buffer_prototype(), + ) + if blob is None: + byte_setter.delete_sync() + else: + byte_setter.set_sync(blob) + + def _encode_shard_dict_sync( + self, + shard_dict: ShardMapping, + chunks_per_shard: tuple[int, ...], + buffer_prototype: BufferPrototype, + ) -> Buffer | None: + """Sync version of _encode_shard_dict.""" + index = _ShardIndex.create_empty(chunks_per_shard) + buffers = [] + template = buffer_prototype.buffer.create_zero_length() + chunk_start = 0 + + for chunk_coords in morton_order_iter(chunks_per_shard): + value = shard_dict.get(chunk_coords) + if value is None or len(value) == 0: + continue + chunk_length = len(value) + buffers.append(value) + index.set_chunk_slice(chunk_coords, slice(chunk_start, chunk_start + chunk_length)) + chunk_start += chunk_length + + if len(buffers) == 0: + return None + + index_bytes = self._encode_shard_index_sync(index) + if self.index_location == ShardingCodecIndexLocation.start: + empty_chunks_mask = index.offsets_and_lengths[..., 0] == MAX_UINT_64 + index.offsets_and_lengths[~empty_chunks_mask, 0] += len(index_bytes) + index_bytes = self._encode_shard_index_sync(index) + buffers.insert(0, index_bytes) + else: + buffers.append(index_bytes) + + return template.combine(buffers) + async def _decode_single( self, shard_bytes: Buffer, @@ -532,6 +882,92 @@ async def _decode_partial_single( else: return out + def _decode_partial_sync( + self, + byte_getter: Any, + selection: SelectorTuple, + shard_spec: ArraySpec, + ) -> NDBuffer | None: + """Sync equivalent of ``_decode_partial_single``. + + Reads only the inner-chunk byte ranges that overlap ``selection`` + (plus the shard index) and decodes them through the inner codec + chain. The store must support ``get_sync`` with byte ranges. + """ + shard_shape = shard_spec.shape + chunk_shape = self.chunk_shape + chunks_per_shard = self._get_chunks_per_shard(shard_spec) + chunk_spec = self._get_chunk_spec(shard_spec) + inner_transform = self._get_inner_chunk_transform(shard_spec) + + indexer = get_indexer( + selection, + shape=shard_shape, + chunk_grid=ChunkGrid.from_sizes(shard_shape, chunk_shape), + ) + + out = shard_spec.prototype.nd_buffer.empty( + shape=indexer.shape, + dtype=shard_spec.dtype.to_native_dtype(), + order=shard_spec.order, + ) + + indexed_chunks = list(indexer) + all_chunk_coords = {chunk_coords for chunk_coords, *_ in indexed_chunks} + + # Read just the inner chunks we need. 
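+        # If the selection covers every inner chunk, a single full get is
+        # cheaper than per-chunk ranged reads; otherwise fetch just the shard
+        # index, then issue one ranged get per overlapping inner chunk.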
+ if self._is_total_shard(all_chunk_coords, chunks_per_shard): + shard_bytes = byte_getter.get_sync(prototype=chunk_spec.prototype) + if shard_bytes is None: + return None + shard_reader = self._shard_reader_from_bytes_sync(shard_bytes, chunks_per_shard) + shard_dict: ShardMapping = shard_reader + else: + shard_index_size = self._shard_index_size(chunks_per_shard) + if self.index_location == ShardingCodecIndexLocation.start: + index_bytes = byte_getter.get_sync( + prototype=numpy_buffer_prototype(), + byte_range=RangeByteRequest(0, shard_index_size), + ) + else: + index_bytes = byte_getter.get_sync( + prototype=numpy_buffer_prototype(), + byte_range=SuffixByteRequest(shard_index_size), + ) + if index_bytes is None: + return None + shard_index = self._decode_shard_index_sync(index_bytes, chunks_per_shard) + shard_dict_mut: dict[tuple[int, ...], Buffer | None] = {} + for chunk_coords in all_chunk_coords: + chunk_byte_slice = shard_index.get_chunk_slice(chunk_coords) + if chunk_byte_slice is not None: + chunk_bytes = byte_getter.get_sync( + prototype=chunk_spec.prototype, + byte_range=RangeByteRequest(chunk_byte_slice[0], chunk_byte_slice[1]), + ) + if chunk_bytes is not None: + shard_dict_mut[chunk_coords] = chunk_bytes + shard_dict = shard_dict_mut + + # Decode each needed inner chunk and scatter into out. + fill_value = shard_spec.fill_value + if fill_value is None: + fill_value = shard_spec.dtype.default_scalar() + for chunk_coords, chunk_selection, out_selection, _ in indexed_chunks: + try: + chunk_bytes = shard_dict[chunk_coords] + except KeyError: + chunk_bytes = None + if chunk_bytes is None: + out[out_selection] = fill_value + continue + chunk_array = inner_transform.decode_chunk(chunk_bytes, chunk_spec) + out[out_selection] = chunk_array[chunk_selection] + + if hasattr(indexer, "sel_shape"): + return out.reshape(indexer.sel_shape) + return out + async def _encode_single( self, shard_array: NDBuffer, @@ -797,6 +1233,33 @@ async def _load_full_shard_maybe( else None ) + @property + def _inner_codecs_fixed_size(self) -> bool: + """True when all inner codecs produce fixed-size output (no compression).""" + return all(c.is_fixed_size for c in self.codecs) + + def _inner_chunk_byte_length(self, chunk_spec: ArraySpec) -> int: + """Encoded byte length of a single inner chunk. 
Only valid when _inner_codecs_fixed_size.""" + raw_byte_length = 1 + for s in self.chunk_shape: + raw_byte_length *= s + raw_byte_length *= chunk_spec.dtype.item_size # type: ignore[attr-defined] + return int(self.codec_pipeline.compute_encoded_size(raw_byte_length, chunk_spec)) + + def _chunk_byte_offset( + self, + chunk_coords: tuple[int, ...], + chunks_per_shard: tuple[int, ...], + chunk_byte_length: int, + ) -> int: + """Byte offset of an inner chunk within a dense shard blob.""" + rank_map = {c: r for r, c in enumerate(morton_order_iter(chunks_per_shard))} + rank = rank_map[chunk_coords] + offset = rank * chunk_byte_length + if self.index_location == ShardingCodecIndexLocation.start: + offset += self._shard_index_size(chunks_per_shard) + return offset + def compute_encoded_size(self, input_byte_length: int, shard_spec: ArraySpec) -> int: chunks_per_shard = self._get_chunks_per_shard(shard_spec) return input_byte_length + self._shard_index_size(chunks_per_shard) diff --git a/tests/test_sync_pipeline.py b/tests/test_sync_pipeline.py new file mode 100644 index 0000000000..1df182b9c5 --- /dev/null +++ b/tests/test_sync_pipeline.py @@ -0,0 +1,593 @@ +"""Tests for SyncCodecPipeline -- the sync-first codec pipeline.""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import pytest + +import zarr +from zarr.abc.store import SupportsSetRange +from zarr.codecs.bytes import BytesCodec +from zarr.codecs.gzip import GzipCodec +from zarr.codecs.transpose import TransposeCodec +from zarr.codecs.zstd import ZstdCodec +from zarr.core.buffer import cpu +from zarr.core.codec_pipeline import SyncCodecPipeline +from zarr.storage import MemoryStore, StorePath + + +def _create_array( + shape: tuple[int, ...], + dtype: str = "float64", + chunks: tuple[int, ...] | None = None, + codecs: tuple[Any, ...] 
= (BytesCodec(),), + fill_value: object = 0, +) -> zarr.Array[Any]: + """Create a zarr array using SyncCodecPipeline.""" + if chunks is None: + chunks = shape + + _ = SyncCodecPipeline.from_codecs(codecs) + + return zarr.create_array( + StorePath(MemoryStore()), + shape=shape, + dtype=dtype, + chunks=chunks, + filters=[c for c in codecs if not isinstance(c, BytesCodec)], + serializer=BytesCodec() if any(isinstance(c, BytesCodec) for c in codecs) else "auto", + compressors=None, + fill_value=fill_value, + ) + + +@pytest.mark.parametrize( + "codecs", + [ + (BytesCodec(),), + (BytesCodec(), GzipCodec(level=1)), + (BytesCodec(), ZstdCodec(level=1)), + (TransposeCodec(order=(1, 0)), BytesCodec()), + (TransposeCodec(order=(1, 0)), BytesCodec(), ZstdCodec(level=1)), + ], + ids=["bytes-only", "gzip", "zstd", "transpose", "transpose+zstd"], +) +def test_construction(codecs: tuple[Any, ...]) -> None: + """SyncCodecPipeline can be constructed from valid codec combinations.""" + pipeline = SyncCodecPipeline.from_codecs(codecs) + assert pipeline.codecs == codecs + + +def test_evolve_from_array_spec() -> None: + """evolve_from_array_spec creates a sync transform.""" + from zarr.core.array_spec import ArrayConfig, ArraySpec + from zarr.core.buffer import default_buffer_prototype + from zarr.core.dtype import get_data_type_from_native_dtype + + pipeline = SyncCodecPipeline.from_codecs((BytesCodec(),)) + assert pipeline._sync_transform is None + + zdtype = get_data_type_from_native_dtype(np.dtype("float64")) + spec = ArraySpec( + shape=(100,), + dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + evolved = pipeline.evolve_from_array_spec(spec) + assert evolved._sync_transform is not None + + +@pytest.mark.parametrize( + ("dtype", "shape"), + [ + ("float64", (100,)), + ("float32", (50,)), + ("int32", (200,)), + ("float64", (10, 10)), + ], + ids=["f64-1d", "f32-1d", "i32-1d", "f64-2d"], +) +def test_read_write_roundtrip(dtype: str, shape: tuple[int, ...]) -> None: + """Data written through SyncCodecPipeline can be read back correctly via async path.""" + from zarr.core.array_spec import ArrayConfig, ArraySpec + from zarr.core.buffer import default_buffer_prototype + from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer + from zarr.core.dtype import get_data_type_from_native_dtype + from zarr.core.sync import sync + + store = MemoryStore() + zdtype = get_data_type_from_native_dtype(np.dtype(dtype)) + spec = ArraySpec( + shape=shape, + dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + + pipeline = SyncCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = pipeline.evolve_from_array_spec(spec) + + # Write + data = np.arange(int(np.prod(shape)), dtype=dtype).reshape(shape) + value = CPUNDBuffer.from_numpy_array(data) + chunk_selection = tuple(slice(0, s) for s in shape) + out_selection = chunk_selection + + store_path = StorePath(store, "c/0") + sync( + pipeline.write( + [(store_path, spec, chunk_selection, out_selection, True)], + value, + ) + ) + + # Read + out = CPUNDBuffer.from_numpy_array(np.zeros(shape, dtype=dtype)) + sync( + pipeline.read( + [(store_path, spec, chunk_selection, out_selection, True)], + out, + ) + ) + + np.testing.assert_array_equal(data, out.as_numpy_array()) + + +def test_read_missing_chunk_fills() -> None: + """Reading a missing chunk fills with the fill value.""" + from 
zarr.core.array_spec import ArrayConfig, ArraySpec + from zarr.core.buffer import default_buffer_prototype + from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer + from zarr.core.dtype import get_data_type_from_native_dtype + from zarr.core.sync import sync + + store = MemoryStore() + zdtype = get_data_type_from_native_dtype(np.dtype("float64")) + spec = ArraySpec( + shape=(10,), + dtype=zdtype, + fill_value=zdtype.cast_scalar(42.0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + + pipeline = SyncCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = pipeline.evolve_from_array_spec(spec) + + out = CPUNDBuffer.from_numpy_array(np.zeros(10, dtype="float64")) + store_path = StorePath(store, "c/0") + chunk_sel = (slice(0, 10),) + + sync( + pipeline.read( + [(store_path, spec, chunk_sel, chunk_sel, True)], + out, + ) + ) + + np.testing.assert_array_equal(out.as_numpy_array(), np.full(10, 42.0)) + + +# --------------------------------------------------------------------------- +# Sync path tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + ("dtype", "shape"), + [ + ("float64", (100,)), + ("float32", (50,)), + ("int32", (200,)), + ("float64", (10, 10)), + ], + ids=["f64-1d", "f32-1d", "i32-1d", "f64-2d"], +) +def test_read_write_sync_roundtrip(dtype: str, shape: tuple[int, ...]) -> None: + """Data written via write_sync can be read back via read_sync.""" + from zarr.core.array_spec import ArrayConfig, ArraySpec + from zarr.core.buffer import default_buffer_prototype + from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer + from zarr.core.dtype import get_data_type_from_native_dtype + + store = MemoryStore() + zdtype = get_data_type_from_native_dtype(np.dtype(dtype)) + spec = ArraySpec( + shape=shape, + dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + + pipeline = SyncCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = pipeline.evolve_from_array_spec(spec) + + data = np.arange(int(np.prod(shape)), dtype=dtype).reshape(shape) + value = CPUNDBuffer.from_numpy_array(data) + chunk_selection = tuple(slice(0, s) for s in shape) + out_selection = chunk_selection + store_path = StorePath(store, "c/0") + + # Write sync + pipeline.write_sync( + [(store_path, spec, chunk_selection, out_selection, True)], + value, + ) + + # Read sync + out = CPUNDBuffer.from_numpy_array(np.zeros(shape, dtype=dtype)) + pipeline.read_sync( + [(store_path, spec, chunk_selection, out_selection, True)], + out, + ) + + np.testing.assert_array_equal(data, out.as_numpy_array()) + + +def test_read_sync_missing_chunk_fills() -> None: + """Sync read of a missing chunk fills with the fill value.""" + from zarr.core.array_spec import ArrayConfig, ArraySpec + from zarr.core.buffer import default_buffer_prototype + from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer + from zarr.core.dtype import get_data_type_from_native_dtype + + store = MemoryStore() + zdtype = get_data_type_from_native_dtype(np.dtype("float64")) + spec = ArraySpec( + shape=(10,), + dtype=zdtype, + fill_value=zdtype.cast_scalar(42.0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + + pipeline = SyncCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = pipeline.evolve_from_array_spec(spec) + + out = CPUNDBuffer.from_numpy_array(np.zeros(10, dtype="float64")) + store_path = 
StorePath(store, "c/0") + chunk_sel = (slice(0, 10),) + + pipeline.read_sync( + [(store_path, spec, chunk_sel, chunk_sel, True)], + out, + ) + + np.testing.assert_array_equal(out.as_numpy_array(), np.full(10, 42.0)) + + +def test_sync_write_async_read_roundtrip() -> None: + """Data written via write_sync can be read back via async read.""" + from zarr.core.array_spec import ArrayConfig, ArraySpec + from zarr.core.buffer import default_buffer_prototype + from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer + from zarr.core.dtype import get_data_type_from_native_dtype + from zarr.core.sync import sync + + store = MemoryStore() + zdtype = get_data_type_from_native_dtype(np.dtype("float64")) + spec = ArraySpec( + shape=(100,), + dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + + pipeline = SyncCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = pipeline.evolve_from_array_spec(spec) + + data = np.arange(100, dtype="float64") + value = CPUNDBuffer.from_numpy_array(data) + chunk_sel = (slice(0, 100),) + store_path = StorePath(store, "c/0") + + # Write sync + pipeline.write_sync( + [(store_path, spec, chunk_sel, chunk_sel, True)], + value, + ) + + # Read async + out = CPUNDBuffer.from_numpy_array(np.zeros(100, dtype="float64")) + sync( + pipeline.read( + [(store_path, spec, chunk_sel, chunk_sel, True)], + out, + ) + ) + + +def test_sync_transform_encode_decode_roundtrip() -> None: + """Sync transform can encode and decode a chunk.""" + from zarr.core.array_spec import ArrayConfig, ArraySpec + from zarr.core.buffer import default_buffer_prototype + from zarr.core.dtype import Float64 + + codecs = (BytesCodec(),) + pipeline = SyncCodecPipeline.from_codecs(codecs) + zdtype = Float64() + spec = ArraySpec( + shape=(100,), + dtype=zdtype, + fill_value=zdtype.cast_scalar(0.0), + prototype=default_buffer_prototype(), + config=ArrayConfig(order="C", write_empty_chunks=True), + ) + pipeline = pipeline.evolve_from_array_spec(spec) + assert pipeline._sync_transform is not None + + # Encode + proto = default_buffer_prototype() + data = proto.nd_buffer.from_numpy_array(np.arange(100, dtype="float64")) + encoded = pipeline._sync_transform.encode_chunk(data, spec) + assert encoded is not None + + # Decode + decoded = pipeline._sync_transform.decode_chunk(encoded, spec) + np.testing.assert_array_equal(decoded.as_numpy_array(), np.arange(100, dtype="float64")) + + +# --------------------------------------------------------------------------- +# Streaming read tests +# --------------------------------------------------------------------------- + + +def test_streaming_read_multiple_chunks() -> None: + """Read with multiple chunks should produce correct results via streaming pipeline.""" + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(100,), + dtype="float64", + chunks=(10,), + shards=None, + compressors=None, + fill_value=0.0, + ) + data = np.arange(100, dtype="float64") + arr[:] = data + result = arr[:] + np.testing.assert_array_equal(result, data) + + +def test_streaming_read_strided_slice() -> None: + """Strided slicing should work correctly with streaming read.""" + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(100,), + dtype="float64", + chunks=(10,), + shards=None, + compressors=None, + fill_value=0.0, + ) + data = np.arange(100, dtype="float64") + arr[:] = data + result = arr[::3] + np.testing.assert_array_equal(result, 
data[::3]) + + +def test_streaming_read_missing_chunks() -> None: + """Reading chunks that were never written should return fill value.""" + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(100,), + dtype="float64", + chunks=(10,), + shards=None, + compressors=None, + fill_value=-1.0, + ) + result = arr[:] + np.testing.assert_array_equal(result, np.full(100, -1.0)) + + +# --------------------------------------------------------------------------- +# Streaming write tests +# --------------------------------------------------------------------------- + + +def test_streaming_write_complete_overwrite() -> None: + """Complete overwrite should skip fetching existing data.""" + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(100,), + dtype="float64", + chunks=(10,), + shards=None, + compressors=None, + fill_value=0.0, + ) + data = np.arange(100, dtype="float64") + arr[:] = data + np.testing.assert_array_equal(arr[:], data) + + +def test_streaming_write_partial_update() -> None: + """Partial updates should correctly merge with existing data.""" + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(100,), + dtype="float64", + chunks=(10,), + shards=None, + compressors=None, + fill_value=0.0, + ) + arr[:] = np.ones(100) + arr[5:15] = np.full(10, 99.0) + result = arr[:] + expected = np.ones(100) + expected[5:15] = 99.0 + np.testing.assert_array_equal(result, expected) + + +def test_memory_store_supports_byte_range_setter() -> None: + """MemoryStore should implement SupportsSetRange.""" + store = zarr.storage.MemoryStore() + assert isinstance(store, SupportsSetRange) + + +async def test_memory_store_set_range() -> None: + """MemoryStore.set_range should overwrite bytes at the given offset.""" + store = zarr.storage.MemoryStore() + await store._ensure_open() + buf = cpu.Buffer.from_bytes(b"AAAAAAAAAA") # 10 bytes + await store.set("test/key", buf) + + patch = cpu.Buffer.from_bytes(b"XX") + await store.set_range("test/key", patch, start=3) + + result = await store.get("test/key", prototype=cpu.buffer_prototype) + assert result is not None + assert result.to_bytes() == b"AAAXXAAAAA" + + +def test_sharding_codec_inner_codecs_fixed_size_no_compression() -> None: + """Inner codecs without compression should be fixed-size.""" + from zarr.codecs.sharding import ShardingCodec + + codec = ShardingCodec(chunk_shape=(10,), codecs=[BytesCodec()]) + assert codec._inner_codecs_fixed_size is True + + +def test_sharding_codec_inner_codecs_fixed_size_with_compression() -> None: + """Inner codecs with compression should NOT be fixed-size.""" + from zarr.codecs.sharding import ShardingCodec + + codec = ShardingCodec(chunk_shape=(10,), codecs=[BytesCodec(), GzipCodec()]) + assert codec._inner_codecs_fixed_size is False + + +def test_partial_shard_write_fixed_size() -> None: + """Writing a single element to a shard with fixed-size codecs should work correctly.""" + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(100,), + dtype="float64", + chunks=(10,), + shards=(100,), + compressors=None, + fill_value=0.0, + ) + arr[:] = np.arange(100, dtype="float64") + arr[5] = 999.0 + result = arr[:] + expected = np.arange(100, dtype="float64") + expected[5] = 999.0 + np.testing.assert_array_equal(result, expected) + + +def test_partial_shard_write_roundtrip_correctness() -> None: + """Multiple partial writes to different inner chunks should all be correct.""" + store = 
zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(100,), + dtype="float64", + chunks=(10,), + shards=(100,), + compressors=None, + fill_value=0.0, + ) + arr[:] = np.zeros(100, dtype="float64") + arr[0:10] = np.ones(10) + arr[50:60] = np.full(10, 2.0) + arr[90:100] = np.full(10, 3.0) + result = arr[:] + expected = np.zeros(100) + expected[0:10] = 1.0 + expected[50:60] = 2.0 + expected[90:100] = 3.0 + np.testing.assert_array_equal(result, expected) + + +def test_partial_shard_write_uses_set_range() -> None: + """Partial shard writes with fixed-size codecs should use set_range_sync. + + Only the SyncCodecPipeline uses byte-range writes for partial shard + updates; skipped under other pipelines. + """ + from unittest.mock import patch + + store = zarr.storage.MemoryStore() + # write_empty_chunks=True keeps a fixed-size dense layout, which is + # required for the byte-range fast path (chunks never transition + # present <-> absent). + arr = zarr.create_array( + store=store, + shape=(100,), + dtype="float64", + chunks=(10,), + shards=(100,), + compressors=None, + fill_value=0.0, + config={"write_empty_chunks": True}, + ) + if not isinstance(arr._async_array.codec_pipeline, SyncCodecPipeline): + pytest.skip("byte-range write optimization is specific to SyncCodecPipeline") + + # Initial full write to create the shard blob + arr[:] = np.arange(100, dtype="float64") + + # Partial write — should use set_range_sync, not set_sync + with patch.object(type(store), "set_range_sync", wraps=store.set_range_sync) as mock_set_range: + arr[5] = 999.0 + + # set_range_sync should be called: once for the chunk data, once for the index + assert mock_set_range.call_count >= 1, ( + "Expected set_range_sync to be called for partial shard write" + ) + + # Verify correctness + expected = np.arange(100, dtype="float64") + expected[5] = 999.0 + np.testing.assert_array_equal(arr[:], expected) + + +def test_partial_shard_write_falls_back_for_compressed() -> None: + """Partial shard writes with compressed inner codecs should NOT use set_range. + + Only meaningful under SyncCodecPipeline (which can use byte-range writes + for fixed-size inner codecs). Other pipelines never use set_range_sync, + so the assertion is trivially true and the test is uninformative. 
+ """ + from unittest.mock import patch + + store = zarr.storage.MemoryStore() + arr = zarr.create_array( + store=store, + shape=(100,), + dtype="float64", + chunks=(10,), + shards=(100,), + compressors=GzipCodec(), + fill_value=0.0, + ) + if not isinstance(arr._async_array.codec_pipeline, SyncCodecPipeline): + pytest.skip("byte-range write optimization is specific to SyncCodecPipeline") + arr[:] = np.arange(100, dtype="float64") + + with patch.object(type(store), "set_range_sync", wraps=store.set_range_sync) as mock_set_range: + arr[5] = 999.0 + + # With compression, set_range_sync should NOT be used + assert mock_set_range.call_count == 0, ( + "set_range_sync should not be used with compressed inner codecs" + ) + + expected = np.arange(100, dtype="float64") + expected[5] = 999.0 + np.testing.assert_array_equal(arr[:], expected) From fba975e061c4ecbd396512fe27beaf0627dd89af Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 17 Apr 2026 22:57:42 +0200 Subject: [PATCH 10/22] test: codec invariants + pipeline parity matrix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two new test files: test_codec_invariants — asserts contract-level properties that every codec / shard / buffer combination must satisfy: round-trip exactness, prototype propagation, fill-value handling, all-empty shard handling. test_pipeline_parity — exhaustive matrix asserting that SyncCodecPipeline and BatchedCodecPipeline produce semantically identical results across codec configs, layouts (including nested sharding), write sequences, and write_empty_chunks settings. Three checks per cell: 1. Same array contents on read. 2. Same set of store keys after writes. 3. Each pipeline reads the other's output identically (catches layout-divergence bugs). These tests pinned the design throughout the SyncCodecPipeline + partial-shard development. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_codec_invariants.py | 320 +++++++++++++++++++++++++++ tests/test_pipeline_parity.py | 385 +++++++++++++++++++++++++++++++++ 2 files changed, 705 insertions(+) create mode 100644 tests/test_codec_invariants.py create mode 100644 tests/test_pipeline_parity.py diff --git a/tests/test_codec_invariants.py b/tests/test_codec_invariants.py new file mode 100644 index 0000000000..5ddf4cfd93 --- /dev/null +++ b/tests/test_codec_invariants.py @@ -0,0 +1,320 @@ +"""Codec / shard / buffer invariants. + +These tests enforce the contracts described in +``docs/superpowers/specs/2026-04-17-codec-pipeline-invariants.md``. +They exist to catch the class of bug where pipeline code reasons +case-by-case about how codecs, shards, IO, and buffers interact and +silently breaks a combination. + +Each test is short and focused on one invariant. If any test here +fails, the corresponding section of the design doc points at what +contract was broken. 
+""" + +from __future__ import annotations + +from dataclasses import replace +from typing import TYPE_CHECKING, Any +from unittest.mock import patch + +import numpy as np +import pytest + +if TYPE_CHECKING: + from pathlib import Path + +import zarr +from zarr.abc.codec import BytesBytesCodec, Codec +from zarr.abc.store import SupportsSetRange +from zarr.codecs.bytes import BytesCodec +from zarr.codecs.crc32c_ import Crc32cCodec +from zarr.codecs.gzip import GzipCodec +from zarr.codecs.transpose import TransposeCodec +from zarr.codecs.zstd import ZstdCodec +from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.buffer import Buffer, default_buffer_prototype +from zarr.core.codec_pipeline import ChunkTransform, SyncCodecPipeline +from zarr.core.dtype import get_data_type_from_native_dtype +from zarr.storage import LocalStore, MemoryStore + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _spec( + shape: tuple[int, ...] = (10,), + dtype: str = "float64", + *, + fill_value: object = 0.0, + write_empty_chunks: bool = False, +) -> ArraySpec: + zdtype = get_data_type_from_native_dtype(np.dtype(dtype)) + return ArraySpec( + shape=shape, + dtype=zdtype, + fill_value=zdtype.cast_scalar(fill_value), + config=ArrayConfig(order="C", write_empty_chunks=write_empty_chunks), + prototype=default_buffer_prototype(), + ) + + +# --------------------------------------------------------------------------- +# C1: Codecs only mutate `shape` +# --------------------------------------------------------------------------- + +# Codecs that we expect to satisfy C1 unconditionally. Each is in a +# state where calling resolve_metadata is safe with the helper spec. 
+_C1_CODECS: list[Codec] = [ + BytesCodec(), + Crc32cCodec(), + GzipCodec(level=1), + ZstdCodec(level=1), + TransposeCodec(order=(0,)), +] + + +@pytest.mark.parametrize("codec", _C1_CODECS, ids=lambda c: type(c).__name__) +def test_C1_resolve_metadata_only_mutates_shape(codec: Codec) -> None: + """C1: prototype, dtype, fill_value, config never change across the codec chain.""" + spec_in = _spec() + spec_out = codec.resolve_metadata(spec_in) + assert spec_out.prototype is spec_in.prototype, f"{type(codec).__name__} changed prototype" + assert spec_out.dtype == spec_in.dtype, f"{type(codec).__name__} changed dtype" + assert spec_out.fill_value == spec_in.fill_value, f"{type(codec).__name__} changed fill_value" + assert spec_out.config == spec_in.config, f"{type(codec).__name__} changed config" + + +# --------------------------------------------------------------------------- +# C2: Each codec call receives the runtime chunk_spec +# --------------------------------------------------------------------------- + + +class _PrototypeRecordingCodec(BytesBytesCodec): # type: ignore[misc,unused-ignore] + """A no-op BB codec that records the prototype it was called with.""" + + is_fixed_size = True + seen_prototypes: list[object] + + def __init__(self) -> None: + object.__setattr__(self, "seen_prototypes", []) + + def to_dict(self) -> dict[str, Any]: + return {"name": "_prototype_recording", "configuration": {}} + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> _PrototypeRecordingCodec: + return cls() + + def compute_encoded_size(self, input_byte_length: int, _spec: ArraySpec) -> int: + return input_byte_length + + def _decode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer: + self.seen_prototypes.append(chunk_spec.prototype) + return chunk_bytes + + def _encode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer | None: + self.seen_prototypes.append(chunk_spec.prototype) + return chunk_bytes + + async def _decode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer: + return self._decode_sync(chunk_bytes, chunk_spec) + + async def _encode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer | None: + return self._encode_sync(chunk_bytes, chunk_spec) + + +def test_C2_chunk_transform_uses_runtime_prototype() -> None: + """C2: the prototype the codec sees comes from the runtime chunk_spec, not a cache.""" + from zarr.core.buffer import BufferPrototype + + recording = _PrototypeRecordingCodec() + transform = ChunkTransform(codecs=(BytesCodec(), recording)) + + proto_default = default_buffer_prototype() + # A distinct BufferPrototype instance with the same buffer/nd_buffer + # types — fails identity check but works at runtime. 
+ proto_other = BufferPrototype(buffer=proto_default.buffer, nd_buffer=proto_default.nd_buffer) + assert proto_other is not proto_default + + spec_a = replace(_spec(), prototype=proto_default) + spec_b = replace(_spec(), prototype=proto_other) + + arr = proto_default.nd_buffer.from_numpy_array(np.arange(10, dtype="float64")) + transform.encode_chunk(arr, spec_a) + transform.encode_chunk(arr, spec_b) + + assert recording.seen_prototypes[0] is proto_default + assert recording.seen_prototypes[1] is proto_other, ( + "ChunkTransform did not pass the runtime prototype to the codec" + ) + + +# --------------------------------------------------------------------------- +# C3: pipeline never branches on codec type +# --------------------------------------------------------------------------- + + +def test_C3_pipeline_methods_do_not_isinstance_check_sharding_codec() -> None: + """C3: Pipeline read/write methods must use supports_partial_*, not isinstance(ShardingCodec). + + Static check: scan the pipeline classes' read/write methods for + `isinstance(..., ShardingCodec)`. Other helpers (e.g. metadata + validation in `codecs_from_list`) may legitimately need the check. + """ + import inspect + import re + + from zarr.core.codec_pipeline import BatchedCodecPipeline, SyncCodecPipeline + + pattern = re.compile(r"isinstance\s*\([^)]*ShardingCodec[^)]*\)") + + for cls in (SyncCodecPipeline, BatchedCodecPipeline): + for method_name in ("read", "write", "read_sync", "write_sync"): + method = getattr(cls, method_name, None) + if method is None: + continue + source = inspect.getsource(method) + matches = pattern.findall(source) + assert not matches, ( + f"{cls.__name__}.{method_name} contains isinstance check on " + f"ShardingCodec; use supports_partial_encode/decode instead. " + f"Matches: {matches}" + ) + + +# --------------------------------------------------------------------------- +# S1 + S2: shard layout is compact and skips empty chunks by default +# --------------------------------------------------------------------------- + + +def test_S2_empty_chunks_omitted_under_default_config() -> None: + """S2: writing fill-value data must not produce store keys for those chunks.""" + store = MemoryStore() + arr = zarr.create_array( + store=store, + shape=(20,), + chunks=(10,), + shards=None, + dtype="float64", + compressors=None, + fill_value=0.0, + ) + # Write fill values to the second chunk; assert no key created for it. + arr[10:20] = 0.0 + assert "c/1" not in store._store_dict + + +def test_S2_empty_shard_deleted_after_partial_writes_to_fill() -> None: + """S2: a sharded array where all inner chunks become fill should drop the shard.""" + store = MemoryStore() + arr = zarr.create_array( + store=store, + shape=(16,), + chunks=(4,), + shards=(8,), + dtype="float64", + compressors=None, + fill_value=0.0, + ) + # Fill the first shard with non-fill data, then overwrite back to fill. 
+ arr[0:8] = np.arange(8, dtype="float64") + 1 + assert "c/0" in store._store_dict + arr[0:8] = 0.0 + assert "c/0" not in store._store_dict, "shard should be deleted when fully empty" + + +# --------------------------------------------------------------------------- +# S3: byte-range fast path requires write_empty_chunks=True +# --------------------------------------------------------------------------- + + +def _is_sync_pipeline_default() -> bool: + """Check whether SyncCodecPipeline is the active pipeline.""" + store = MemoryStore() + arr = zarr.create_array(store=store, shape=(8,), chunks=(8,), dtype="uint8", fill_value=0) + return isinstance(arr._async_array.codec_pipeline, SyncCodecPipeline) + + +def test_S3_byte_range_path_skipped_when_write_empty_chunks_false() -> None: + """S3: under default config, partial shard writes do not call set_range_sync.""" + if not _is_sync_pipeline_default(): + pytest.skip("byte-range fast path is specific to SyncCodecPipeline") + + store = MemoryStore() + arr = zarr.create_array( + store=store, + shape=(100,), + chunks=(10,), + shards=(100,), + dtype="float64", + compressors=None, + fill_value=0.0, + # Default config: write_empty_chunks=False + ) + arr[:] = np.arange(100, dtype="float64") + with patch.object(type(store), "set_range_sync", wraps=store.set_range_sync) as mock: + arr[5] = 999.0 + assert mock.call_count == 0, ( + "byte-range fast path was taken with write_empty_chunks=False; " + "this would produce a dense shard layout incompatible with empty-chunk skipping" + ) + + +def test_S3_byte_range_path_used_when_write_empty_chunks_true() -> None: + """S3: with write_empty_chunks=True, partial shard writes use set_range_sync.""" + if not _is_sync_pipeline_default(): + pytest.skip("byte-range fast path is specific to SyncCodecPipeline") + + store = MemoryStore() + arr = zarr.create_array( + store=store, + shape=(100,), + chunks=(10,), + shards=(100,), + dtype="float64", + compressors=None, + fill_value=0.0, + config={"write_empty_chunks": True}, + ) + arr[:] = np.arange(100, dtype="float64") + with patch.object(type(store), "set_range_sync", wraps=store.set_range_sync) as mock: + arr[5] = 999.0 + assert mock.call_count >= 1, "byte-range fast path was not taken with write_empty_chunks=True" + + +# --------------------------------------------------------------------------- +# B1: code that mutates buffers from store IO must copy first +# --------------------------------------------------------------------------- + + +def test_B1_partial_shard_write_handles_readonly_store_buffers(tmp_path: Path) -> None: + """B1: LocalStore returns read-only buffers; mutating-paths must copy.""" + store = LocalStore(tmp_path / "data.zarr") + arr = zarr.create_array( + store=store, + shape=(16,), + chunks=(4,), + shards=(8,), + dtype="float64", + compressors=None, + fill_value=0.0, + config={"write_empty_chunks": True}, + ) + arr[:] = np.arange(16, dtype="float64") + # This triggers the byte-range path which decodes the shard index from + # a (potentially read-only) store buffer and then mutates it. If the + # decode result isn't copied, the next line raises + # `ValueError: assignment destination is read-only`. 
+ arr[2] = 42.0 + assert arr[2] == 42.0 + + +# --------------------------------------------------------------------------- +# Sanity: SupportsSetRange is correctly implemented +# --------------------------------------------------------------------------- + + +def test_supports_set_range_is_runtime_checkable() -> None: + """Stores should report SupportsSetRange membership via isinstance.""" + assert isinstance(MemoryStore(), SupportsSetRange) diff --git a/tests/test_pipeline_parity.py b/tests/test_pipeline_parity.py new file mode 100644 index 0000000000..3352966d8a --- /dev/null +++ b/tests/test_pipeline_parity.py @@ -0,0 +1,385 @@ +"""Pipeline parity test — exhaustive matrix of read/write scenarios. + +For every cell of the matrix (codec config x layout x operation +sequence x runtime config), assert that ``SyncCodecPipeline`` and +``BatchedCodecPipeline`` produce semantically identical results: + + * Same returned array contents on read. + * Same set of store keys after writes (catches divergent empty-shard + handling: one pipeline deletes, the other writes an empty blob). + * Reading each pipeline's store contents through the *other* pipeline + yields the same array (catches "wrote a layout that only one + pipeline can read" bugs). + +Pipeline-divergence bugs (e.g. one pipeline writes a dense shard +layout while the other writes a compact layout) fail this test +loudly with a clear diff, instead of waiting for a downstream +test to trip over the symptom. + +Byte-for-byte equality of store contents is intentionally NOT +checked: codecs like gzip embed the wall-clock timestamp in their +output, so two compressions of the same data done at different +seconds produce different bytes despite being semantically +identical. + +The matrix axes are: + + * codec chain — bytes-only, gzip, with/without sharding + * layout — chunk_shape, shard_shape (None for no sharding) + * write sequence — full overwrite, partial in middle, scalar to one + cell, multiple overlapping writes, sequence ending in fill values + * runtime config — write_empty_chunks True/False +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import numpy as np +import pytest + +import zarr +from zarr.codecs.gzip import GzipCodec +from zarr.codecs.sharding import ShardingCodec +from zarr.core.config import config as zarr_config +from zarr.storage import MemoryStore + +if TYPE_CHECKING: + from collections.abc import Callable, Iterator + + +# --------------------------------------------------------------------------- +# Reference helpers +# --------------------------------------------------------------------------- + + +def _store_snapshot(store: MemoryStore) -> dict[str, bytes]: + """Return {key: bytes} for every entry in the store.""" + return {k: bytes(v.to_bytes()) for k, v in store._store_dict.items()} + + +# --------------------------------------------------------------------------- +# Matrix definitions +# --------------------------------------------------------------------------- + + +# Each codec config is (filters, serializer, compressors). We only vary the +# pieces that actually affect the pipeline. compressors=None means a +# fixed-size chain (the byte-range fast path is eligible when sharded). +CodecConfig = dict[str, Any] + +CODEC_CONFIGS: list[tuple[str, CodecConfig]] = [ + ("bytes-only", {"compressors": None}), + ("gzip", {"compressors": GzipCodec(level=1)}), +] + + +# (id, kwargs) — chunks/shards layout. kwargs are passed to create_array. 
+LayoutConfig = dict[str, Any] + +LAYOUT_CONFIGS: list[tuple[str, LayoutConfig]] = [ + ("1d-unsharded", {"shape": (100,), "chunks": (10,), "shards": None}), + ("1d-1chunk-per-shard", {"shape": (100,), "chunks": (10,), "shards": (10,)}), + ("1d-multi-chunk-per-shard", {"shape": (100,), "chunks": (10,), "shards": (50,)}), + ("2d-unsharded", {"shape": (20, 20), "chunks": (5, 5), "shards": None}), + ("2d-sharded", {"shape": (20, 20), "chunks": (5, 5), "shards": (10, 10)}), + # Nested sharding: outer chunk (10,10) sharded into inner chunks (5,5). + # Restricted to bytes-only codec because combining an outer ShardingCodec + # with a compressor (gzip) triggers a ZarrUserWarning and results in a + # checksum mismatch inside the inner shard index — a known limitation, not + # a pipeline-parity bug. The bytes-only path still exercises the full + # two-level shard encoding/decoding in both pipelines. + ( + "2d-nested-sharded", + { + "shape": (20, 20), + "chunks": (10, 10), + "shards": None, + "serializer": ShardingCodec( + chunk_shape=(10, 10), + codecs=[ShardingCodec(chunk_shape=(5, 5))], + ), + # Only run with the bytes-only codec config; gzip is incompatible + # with nested sharding (see comment above). + "_codec_ids": {"bytes-only"}, + }, + ), +] + + +WriteOp = tuple[Any, Any] # (selection, value) +WriteSequence = tuple[str, list[WriteOp]] + + +def _full_overwrite(shape: tuple[int, ...]) -> list[WriteOp]: + return [((slice(None),) * len(shape), np.arange(int(np.prod(shape))).reshape(shape) + 1)] + + +def _partial_middle(shape: tuple[int, ...]) -> list[WriteOp]: + if len(shape) == 1: + n = shape[0] + return [((slice(n // 4, 3 * n // 4),), 7)] + # 2D: write a centered block + rs = slice(shape[0] // 4, 3 * shape[0] // 4) + cs = slice(shape[1] // 4, 3 * shape[1] // 4) + return [((rs, cs), 7)] + + +def _scalar_one_cell(shape: tuple[int, ...]) -> list[WriteOp]: + if len(shape) == 1: + return [((shape[0] // 2,), 99)] + return [((shape[0] // 2, shape[1] // 2), 99)] + + +def _overlapping(shape: tuple[int, ...]) -> list[WriteOp]: + if len(shape) == 1: + n = shape[0] + return [ + ((slice(0, n // 2),), 1), + ((slice(n // 4, 3 * n // 4),), 2), + ((slice(n // 2, n),), 3), + ] + rs1, cs1 = slice(0, shape[0] // 2), slice(0, shape[1] // 2) + rs2, cs2 = slice(shape[0] // 4, 3 * shape[0] // 4), slice(shape[1] // 4, 3 * shape[1] // 4) + return [((rs1, cs1), 1), ((rs2, cs2), 2)] + + +def _ends_in_fill(shape: tuple[int, ...]) -> list[WriteOp]: + """Write something then overwrite it with fill — exercises empty-chunk handling.""" + full = (slice(None),) * len(shape) + return [(full, 5), (full, 0)] + + +def _ends_in_partial_fill(shape: tuple[int, ...]) -> list[WriteOp]: + """Write data, then overwrite half with fill — some chunks become empty.""" + full: tuple[slice, ...] + half: tuple[slice, ...] 
+ if len(shape) == 1: + full = (slice(None),) + half = (slice(0, shape[0] // 2),) + else: + full = (slice(None), slice(None)) + half = (slice(0, shape[0] // 2), slice(None)) + return [(full, 5), (half, 0)] + + +SEQUENCES: list[tuple[str, Callable[[tuple[int, ...]], list[WriteOp]]]] = [ + ("full-overwrite", _full_overwrite), + ("partial-middle", _partial_middle), + ("scalar-one-cell", _scalar_one_cell), + ("overlapping", _overlapping), + ("ends-in-fill", _ends_in_fill), + ("ends-in-partial-fill", _ends_in_partial_fill), +] + + +WRITE_EMPTY_CHUNKS = [False, True] + + +# --------------------------------------------------------------------------- +# Matrix iteration (pruned) +# --------------------------------------------------------------------------- + + +def _matrix() -> Iterator[Any]: + for codec_id, codec_kwargs in CODEC_CONFIGS: + for layout_id, layout in LAYOUT_CONFIGS: + allowed = layout.get("_codec_ids") + if allowed is not None and codec_id not in allowed: + continue + for seq_id, seq_fn in SEQUENCES: + for wec in WRITE_EMPTY_CHUNKS: + yield pytest.param( + codec_kwargs, + layout, + seq_fn, + wec, + id=f"{layout_id}-{codec_id}-{seq_id}-wec{wec}", + ) + + +# --------------------------------------------------------------------------- +# The parity test +# --------------------------------------------------------------------------- + + +def _write_under_pipeline( + pipeline_path: str, + codec_kwargs: CodecConfig, + layout: LayoutConfig, + sequence: list[WriteOp], + write_empty_chunks: bool, +) -> tuple[MemoryStore, Any]: + """Apply a sequence of writes via the chosen pipeline. + + Returns (store with the written data, final array contents read back). + """ + # Strip private metadata keys (e.g. "_codec_ids") before passing to create_array. + array_layout = {k: v for k, v in layout.items() if not k.startswith("_")} + store = MemoryStore() + with zarr_config.set({"codec_pipeline.path": pipeline_path}): + arr = zarr.create_array( + store=store, + dtype="float64", + fill_value=0.0, + config={"write_empty_chunks": write_empty_chunks}, + **array_layout, + **codec_kwargs, + ) + for sel, val in sequence: + arr[sel] = val + contents = arr[...] + return store, contents + + +def _read_under_pipeline(pipeline_path: str, store: MemoryStore) -> Any: + """Re-open an existing store under the chosen pipeline and read it whole.""" + with zarr_config.set({"codec_pipeline.path": pipeline_path}): + arr = zarr.open_array(store=store, mode="r") + return arr[...] + + +_BATCHED = "zarr.core.codec_pipeline.BatchedCodecPipeline" +_SYNC = "zarr.core.codec_pipeline.SyncCodecPipeline" + + +@pytest.mark.parametrize( + ("codec_kwargs", "layout", "sequence_fn", "write_empty_chunks"), + list(_matrix()), +) +def test_pipeline_parity( + codec_kwargs: CodecConfig, + layout: LayoutConfig, + sequence_fn: Callable[[tuple[int, ...]], list[WriteOp]], + write_empty_chunks: bool, +) -> None: + """SyncCodecPipeline must be semantically identical to BatchedCodecPipeline. + + Three checks, in order of decreasing diagnostic value: + + 1. Both pipelines return the same array contents after the same + write sequence (catches semantic correctness bugs). + 2. Both pipelines produce the same set of store keys (catches + empty-shard divergence: one deletes, the other doesn't). + 3. Each pipeline can correctly read the *other* pipeline's + output (catches layout-divergence bugs that would prevent + interop, e.g. dense vs compact shard layouts). 
+ + Byte-for-byte store equality is intentionally not checked: codecs + like gzip embed wall-clock timestamps that vary between runs. + """ + sequence = sequence_fn(layout["shape"]) + + batched_store, batched_arr = _write_under_pipeline( + _BATCHED, codec_kwargs, layout, sequence, write_empty_chunks + ) + sync_store, sync_arr = _write_under_pipeline( + _SYNC, codec_kwargs, layout, sequence, write_empty_chunks + ) + + # 1. Array contents must agree. + np.testing.assert_array_equal( + sync_arr, + batched_arr, + err_msg="SyncCodecPipeline returned different array contents than BatchedCodecPipeline", + ) + + # 2. Store key sets must agree. + batched_keys = set(batched_store._store_dict) - {"zarr.json"} + sync_keys = set(sync_store._store_dict) - {"zarr.json"} + assert sync_keys == batched_keys, ( + f"Pipelines disagree on which store keys exist.\n" + f" only in batched: {sorted(batched_keys - sync_keys)}\n" + f" only in sync: {sorted(sync_keys - batched_keys)}" + ) + + # 3. Cross-read: each pipeline must correctly read the other's output. + sync_reads_batched = _read_under_pipeline(_SYNC, batched_store) + batched_reads_sync = _read_under_pipeline(_BATCHED, sync_store) + np.testing.assert_array_equal( + sync_reads_batched, + batched_arr, + err_msg="SyncCodecPipeline could not correctly read BatchedCodecPipeline's output", + ) + np.testing.assert_array_equal( + batched_reads_sync, + sync_arr, + err_msg="BatchedCodecPipeline could not correctly read SyncCodecPipeline's output", + ) + + +# --------------------------------------------------------------------------- +# Read parity: cover partial reads (not just full reads as in the matrix above) +# --------------------------------------------------------------------------- + + +def _read_selections(shape: tuple[int, ...]) -> list[tuple[str, Any]]: + """Selections that exercise the partial-decode path differently.""" + if len(shape) == 1: + n = shape[0] + return [ + ("scalar-first", (0,)), + ("scalar-mid", (n // 2,)), + ("partial-slice", (slice(n // 4, 3 * n // 4),)), + ("strided", (slice(0, n, 3),)), + ("full", (slice(None),)), + ] + return [ + ("scalar-first", (0,) * len(shape)), + ("scalar-mid", tuple(s // 2 for s in shape)), + ("partial-slice", tuple(slice(s // 4, 3 * s // 4) for s in shape)), + ("full", (slice(None),) * len(shape)), + ] + + +def _read_matrix() -> Iterator[Any]: + for codec_id, codec_kwargs in CODEC_CONFIGS: + for layout_id, layout in LAYOUT_CONFIGS: + allowed = layout.get("_codec_ids") + if allowed is not None and codec_id not in allowed: + continue + for sel_id, sel in _read_selections(layout["shape"]): + yield pytest.param( + codec_kwargs, + layout, + sel, + id=f"{layout_id}-{codec_id}-{sel_id}", + ) + + +@pytest.mark.parametrize( + ("codec_kwargs", "layout", "selection"), + list(_read_matrix()), +) +def test_pipeline_read_parity( + codec_kwargs: CodecConfig, + layout: LayoutConfig, + selection: Any, +) -> None: + """Partial reads via SyncCodecPipeline must match BatchedCodecPipeline. + + The full-write/full-read parity test above doesn't exercise partial + reads (e.g. a single element from a sharded array), which take a + different code path (``_decode_partial_single`` on the sharding + codec). This test fills the array under one pipeline and reads + arbitrary selections under both, asserting equality. + """ + # Fill under batched (the canonical pipeline) so the contents are + # well-defined regardless of the codec under test. 
+ store, _full = _write_under_pipeline( + _BATCHED, codec_kwargs, layout, _full_overwrite(layout["shape"]), True + ) + + with zarr_config.set({"codec_pipeline.path": _BATCHED}): + batched_arr = zarr.open_array(store=store, mode="r")[selection] + with zarr_config.set({"codec_pipeline.path": _SYNC}): + sync_arr = zarr.open_array(store=store, mode="r")[selection] + + np.testing.assert_array_equal( + sync_arr, + batched_arr, + err_msg=( + f"SyncCodecPipeline read returned different result than BatchedCodecPipeline " + f"for selection {selection!r}" + ), + ) From 1be556319213fd43b11ca09b6d612d79f6f4c8dd Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 17 Apr 2026 22:57:48 +0200 Subject: [PATCH 11/22] chore: gitignore local agent/planning notes Adds .gitignore entries for .claude/, CLAUDE.md, and docs/superpowers/ so local IDE/agent planning artifacts don't get committed by accident. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index b79ce264c8..5e3b44d213 100644 --- a/.gitignore +++ b/.gitignore @@ -91,3 +91,8 @@ tests/.hypothesis zarr/version.py zarr.egg-info/ + +# Local agent / planning notes (not versioned) +.claude/ +CLAUDE.md +docs/superpowers/ From 8b23d22fb99ebae29f30501cc035d1615f9c5cee Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 30 Apr 2026 21:22:55 -0400 Subject: [PATCH 12/22] chore: remove unused PreparedWrite and SupportsChunkCodec Both were exported from zarr.abc.codec.__all__ but never referenced by either codec pipeline or any test. Artifacts of an earlier design iteration superseded by the current SyncCodecPipeline. Also remove now-unused imports of `dataclass` and `ChunkProjection` that were only needed by the deleted symbols. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/zarr/abc/codec.py | 55 +------------------------------------------ 1 file changed, 1 insertion(+), 54 deletions(-) diff --git a/src/zarr/abc/codec.py b/src/zarr/abc/codec.py index ae8a78a34d..eed2119aff 100644 --- a/src/zarr/abc/codec.py +++ b/src/zarr/abc/codec.py @@ -2,7 +2,6 @@ from abc import abstractmethod from collections.abc import Mapping -from dataclasses import dataclass from typing import TYPE_CHECKING, Literal, Protocol, TypeGuard, runtime_checkable from typing_extensions import ReadOnly, TypedDict @@ -19,7 +18,7 @@ from zarr.abc.store import ByteGetter, ByteSetter, Store from zarr.core.array_spec import ArraySpec from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType - from zarr.core.indexing import ChunkProjection, SelectorTuple + from zarr.core.indexing import SelectorTuple from zarr.core.metadata import ArrayMetadata from zarr.core.metadata.v3 import ChunkGridMetadata @@ -34,8 +33,6 @@ "CodecOutput", "CodecPipeline", "GetResult", - "PreparedWrite", - "SupportsChunkCodec", "SupportsSyncCodec", ] @@ -85,25 +82,6 @@ def _decode_sync(self, chunk_data: CO, chunk_spec: ArraySpec) -> CI: ... def _encode_sync(self, chunk_data: CI, chunk_spec: ArraySpec) -> CO | None: ... -class SupportsChunkCodec(Protocol): - """Protocol for objects that can decode/encode whole chunks synchronously. - - `ChunkTransform` satisfies this protocol. The ``chunk_shape`` parameter - allows decoding/encoding chunks of different shapes (e.g. rectilinear - grids) without rebuilding the transform. - """ - - array_spec: ArraySpec - - def decode_chunk( - self, chunk_bytes: Buffer, chunk_shape: tuple[int, ...] | None = None - ) -> NDBuffer: ... 
- - def encode_chunk( - self, chunk_array: NDBuffer, chunk_shape: tuple[int, ...] | None = None - ) -> Buffer | None: ... - - class BaseCodec[CI: CodecInput, CO: CodecOutput](Metadata): """Generic base class for codecs. @@ -229,37 +207,6 @@ class ArrayArrayCodec(BaseCodec[NDBuffer, NDBuffer]): """Base class for array-to-array codecs.""" -@dataclass -class PreparedWrite: - """Intermediate state between reading existing data and writing new data. - - Created by `prepare_write_sync` / `prepare_write`, consumed by - `finalize_write_sync` / `finalize_write`. The compute phase sits - in between: iterate over `indexer`, decode the corresponding entry - in `chunk_dict`, merge new data, re-encode, and store the result - back into `chunk_dict`. - - Attributes - ---------- - chunk_dict : dict[tuple[int, ...], Buffer | None] - Per-inner-chunk encoded bytes, keyed by chunk coordinates. - For a regular array this is `{(0,): }`. For a sharded - array it contains one entry per inner chunk in the shard, - including chunks not being modified (they pass through - unchanged). `None` means the chunk did not exist on disk. - indexer : list[ChunkProjection] - The inner chunks to modify. Each entry's `chunk_coords` - corresponds to a key in `chunk_dict`. `chunk_selection` - identifies the region within that inner chunk, and - `out_selection` identifies the corresponding region in the - source value array. This is a subset of `chunk_dict`'s keys - — untouched chunks are not listed. - """ - - chunk_dict: dict[tuple[int, ...], Buffer | None] - indexer: list[ChunkProjection] - - class ArrayBytesCodec(BaseCodec[NDBuffer, Buffer]): """Base class for array-to-bytes codecs.""" From cd3c14b137e85c3e4eef0be06dfb38b845f3b2ea Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 30 Apr 2026 21:24:51 -0400 Subject: [PATCH 13/22] chore: remove stale phased-pipeline test files Both tests/test_phased_codec_pipeline.py and tests/test_pipeline_benchmark.py import PhasedCodecPipeline, which no longer exists in src/. Each failed at collection. The benchmarking intent of test_pipeline_benchmark.py is replaced by extensions to tests/benchmarks/test_e2e.py later in this branch. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_phased_codec_pipeline.py | 293 ---------------------------- tests/test_pipeline_benchmark.py | 168 ---------------- 2 files changed, 461 deletions(-) delete mode 100644 tests/test_phased_codec_pipeline.py delete mode 100644 tests/test_pipeline_benchmark.py diff --git a/tests/test_phased_codec_pipeline.py b/tests/test_phased_codec_pipeline.py deleted file mode 100644 index 902cc2ff20..0000000000 --- a/tests/test_phased_codec_pipeline.py +++ /dev/null @@ -1,293 +0,0 @@ -"""Tests for PhasedCodecPipeline — the three-phase prepare/compute/finalize pipeline.""" - -from __future__ import annotations - -from typing import Any - -import numpy as np -import pytest - -import zarr -from zarr.codecs.bytes import BytesCodec -from zarr.codecs.gzip import GzipCodec -from zarr.codecs.transpose import TransposeCodec -from zarr.codecs.zstd import ZstdCodec -from zarr.core.codec_pipeline import PhasedCodecPipeline -from zarr.storage import MemoryStore, StorePath - - -def _create_array( - shape: tuple[int, ...], - dtype: str = "float64", - chunks: tuple[int, ...] | None = None, - codecs: tuple[Any, ...] 
= (BytesCodec(),), - fill_value: object = 0, -) -> zarr.Array[Any]: - """Create a zarr array using PhasedCodecPipeline.""" - if chunks is None: - chunks = shape - - _ = PhasedCodecPipeline.from_codecs(codecs) - - return zarr.create_array( - StorePath(MemoryStore()), - shape=shape, - dtype=dtype, - chunks=chunks, - filters=[c for c in codecs if not isinstance(c, BytesCodec)], - serializer=BytesCodec() if any(isinstance(c, BytesCodec) for c in codecs) else "auto", - compressors=None, - fill_value=fill_value, - ) - - -@pytest.mark.parametrize( - "codecs", - [ - (BytesCodec(),), - (BytesCodec(), GzipCodec(level=1)), - (BytesCodec(), ZstdCodec(level=1)), - (TransposeCodec(order=(1, 0)), BytesCodec()), - (TransposeCodec(order=(1, 0)), BytesCodec(), ZstdCodec(level=1)), - ], - ids=["bytes-only", "gzip", "zstd", "transpose", "transpose+zstd"], -) -def test_construction(codecs: tuple[Any, ...]) -> None: - """PhasedCodecPipeline can be constructed from valid codec combinations.""" - pipeline = PhasedCodecPipeline.from_codecs(codecs) - assert pipeline.codecs == codecs - - -def test_evolve_from_array_spec() -> None: - """evolve_from_array_spec creates a ChunkTransform.""" - from zarr.core.array_spec import ArrayConfig, ArraySpec - from zarr.core.buffer import default_buffer_prototype - from zarr.core.dtype import get_data_type_from_native_dtype - - pipeline = PhasedCodecPipeline.from_codecs((BytesCodec(),)) - assert pipeline.chunk_transform is None - - zdtype = get_data_type_from_native_dtype(np.dtype("float64")) - spec = ArraySpec( - shape=(100,), - dtype=zdtype, - fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), - prototype=default_buffer_prototype(), - ) - evolved = pipeline.evolve_from_array_spec(spec) - assert evolved.chunk_transform is not None - - -@pytest.mark.parametrize( - ("dtype", "shape"), - [ - ("float64", (100,)), - ("float32", (50,)), - ("int32", (200,)), - ("float64", (10, 10)), - ], - ids=["f64-1d", "f32-1d", "i32-1d", "f64-2d"], -) -async def test_read_write_roundtrip(dtype: str, shape: tuple[int, ...]) -> None: - """Data written through PhasedCodecPipeline can be read back correctly.""" - from zarr.core.array_spec import ArrayConfig, ArraySpec - from zarr.core.buffer import default_buffer_prototype - from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer - from zarr.core.dtype import get_data_type_from_native_dtype - - store = MemoryStore() - zdtype = get_data_type_from_native_dtype(np.dtype(dtype)) - spec = ArraySpec( - shape=shape, - dtype=zdtype, - fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), - prototype=default_buffer_prototype(), - ) - - pipeline = PhasedCodecPipeline.from_codecs((BytesCodec(),)) - pipeline = pipeline.evolve_from_array_spec(spec) - - # Write - data = np.arange(int(np.prod(shape)), dtype=dtype).reshape(shape) - value = CPUNDBuffer.from_numpy_array(data) - chunk_selection = tuple(slice(0, s) for s in shape) - out_selection = chunk_selection - - store_path = StorePath(store, "c/0") - await pipeline.write( - [(store_path, spec, chunk_selection, out_selection, True)], - value, - ) - - # Read - out = CPUNDBuffer.from_numpy_array(np.zeros(shape, dtype=dtype)) - await pipeline.read( - [(store_path, spec, chunk_selection, out_selection, True)], - out, - ) - - np.testing.assert_array_equal(data, out.as_numpy_array()) - - -async def test_read_missing_chunk_fills() -> None: - """Reading a missing chunk fills with the fill value.""" - from zarr.core.array_spec import ArrayConfig, 
ArraySpec - from zarr.core.buffer import default_buffer_prototype - from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer - from zarr.core.dtype import get_data_type_from_native_dtype - - store = MemoryStore() - zdtype = get_data_type_from_native_dtype(np.dtype("float64")) - spec = ArraySpec( - shape=(10,), - dtype=zdtype, - fill_value=zdtype.cast_scalar(42.0), - config=ArrayConfig(order="C", write_empty_chunks=True), - prototype=default_buffer_prototype(), - ) - - pipeline = PhasedCodecPipeline.from_codecs((BytesCodec(),)) - pipeline = pipeline.evolve_from_array_spec(spec) - - out = CPUNDBuffer.from_numpy_array(np.zeros(10, dtype="float64")) - store_path = StorePath(store, "c/0") - chunk_sel = (slice(0, 10),) - - await pipeline.read( - [(store_path, spec, chunk_sel, chunk_sel, True)], - out, - ) - - np.testing.assert_array_equal(out.as_numpy_array(), np.full(10, 42.0)) - - -# --------------------------------------------------------------------------- -# Sync path tests -# --------------------------------------------------------------------------- - - -@pytest.mark.parametrize( - ("dtype", "shape"), - [ - ("float64", (100,)), - ("float32", (50,)), - ("int32", (200,)), - ("float64", (10, 10)), - ], - ids=["f64-1d", "f32-1d", "i32-1d", "f64-2d"], -) -def test_read_write_sync_roundtrip(dtype: str, shape: tuple[int, ...]) -> None: - """Data written via write_sync can be read back via read_sync.""" - from zarr.core.array_spec import ArrayConfig, ArraySpec - from zarr.core.buffer import default_buffer_prototype - from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer - from zarr.core.dtype import get_data_type_from_native_dtype - - store = MemoryStore() - zdtype = get_data_type_from_native_dtype(np.dtype(dtype)) - spec = ArraySpec( - shape=shape, - dtype=zdtype, - fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), - prototype=default_buffer_prototype(), - ) - - pipeline = PhasedCodecPipeline.from_codecs((BytesCodec(),)) - pipeline = pipeline.evolve_from_array_spec(spec) - - data = np.arange(int(np.prod(shape)), dtype=dtype).reshape(shape) - value = CPUNDBuffer.from_numpy_array(data) - chunk_selection = tuple(slice(0, s) for s in shape) - out_selection = chunk_selection - store_path = StorePath(store, "c/0") - - # Write sync - pipeline.write_sync( - [(store_path, spec, chunk_selection, out_selection, True)], - value, - ) - - # Read sync - out = CPUNDBuffer.from_numpy_array(np.zeros(shape, dtype=dtype)) - pipeline.read_sync( - [(store_path, spec, chunk_selection, out_selection, True)], - out, - ) - - np.testing.assert_array_equal(data, out.as_numpy_array()) - - -def test_read_sync_missing_chunk_fills() -> None: - """Sync read of a missing chunk fills with the fill value.""" - from zarr.core.array_spec import ArrayConfig, ArraySpec - from zarr.core.buffer import default_buffer_prototype - from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer - from zarr.core.dtype import get_data_type_from_native_dtype - - store = MemoryStore() - zdtype = get_data_type_from_native_dtype(np.dtype("float64")) - spec = ArraySpec( - shape=(10,), - dtype=zdtype, - fill_value=zdtype.cast_scalar(42.0), - config=ArrayConfig(order="C", write_empty_chunks=True), - prototype=default_buffer_prototype(), - ) - - pipeline = PhasedCodecPipeline.from_codecs((BytesCodec(),)) - pipeline = pipeline.evolve_from_array_spec(spec) - - out = CPUNDBuffer.from_numpy_array(np.zeros(10, dtype="float64")) - store_path = StorePath(store, "c/0") - chunk_sel = (slice(0, 10),) - - 
pipeline.read_sync( - [(store_path, spec, chunk_sel, chunk_sel, True)], - out, - ) - - np.testing.assert_array_equal(out.as_numpy_array(), np.full(10, 42.0)) - - -async def test_sync_write_async_read_roundtrip() -> None: - """Data written via write_sync can be read back via async read.""" - from zarr.core.array_spec import ArrayConfig, ArraySpec - from zarr.core.buffer import default_buffer_prototype - from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer - from zarr.core.dtype import get_data_type_from_native_dtype - - store = MemoryStore() - zdtype = get_data_type_from_native_dtype(np.dtype("float64")) - spec = ArraySpec( - shape=(100,), - dtype=zdtype, - fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), - prototype=default_buffer_prototype(), - ) - - pipeline = PhasedCodecPipeline.from_codecs((BytesCodec(),)) - pipeline = pipeline.evolve_from_array_spec(spec) - - data = np.arange(100, dtype="float64") - value = CPUNDBuffer.from_numpy_array(data) - chunk_sel = (slice(0, 100),) - store_path = StorePath(store, "c/0") - - # Write sync - pipeline.write_sync( - [(store_path, spec, chunk_sel, chunk_sel, True)], - value, - ) - - # Read async - out = CPUNDBuffer.from_numpy_array(np.zeros(100, dtype="float64")) - await pipeline.read( - [(store_path, spec, chunk_sel, chunk_sel, True)], - out, - ) - - np.testing.assert_array_equal(data, out.as_numpy_array()) diff --git a/tests/test_pipeline_benchmark.py b/tests/test_pipeline_benchmark.py deleted file mode 100644 index 5d05190a95..0000000000 --- a/tests/test_pipeline_benchmark.py +++ /dev/null @@ -1,168 +0,0 @@ -"""Benchmark comparing BatchedCodecPipeline vs PhasedCodecPipeline. - -Run with: hatch run test.py3.12-minimal:pytest tests/test_pipeline_benchmark.py -v --benchmark-enable -""" - -from __future__ import annotations - -from enum import Enum -from typing import TYPE_CHECKING, Any - -import numpy as np -import pytest - -from zarr.codecs.bytes import BytesCodec -from zarr.codecs.gzip import GzipCodec -from zarr.codecs.sharding import ShardingCodec -from zarr.core.array_spec import ArrayConfig, ArraySpec -from zarr.core.buffer import default_buffer_prototype -from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer -from zarr.core.codec_pipeline import BatchedCodecPipeline, PhasedCodecPipeline -from zarr.core.dtype import get_data_type_from_native_dtype -from zarr.core.sync import sync -from zarr.storage import MemoryStore, StorePath - -if TYPE_CHECKING: - from zarr.abc.codec import Codec - - -class PipelineKind(Enum): - batched = "batched" - phased_async = "phased_async" - phased_sync = "phased_sync" - phased_sync_threaded = "phased_sync_threaded" - - -# 1 MB of float64 = 131072 elements -CHUNK_ELEMENTS = 1024 * 1024 // 8 -CHUNK_SHAPE = (CHUNK_ELEMENTS,) - - -def _make_spec(shape: tuple[int, ...], dtype: str = "float64") -> ArraySpec: - zdtype = get_data_type_from_native_dtype(np.dtype(dtype)) - return ArraySpec( - shape=shape, - dtype=zdtype, - fill_value=zdtype.cast_scalar(0), - config=ArrayConfig(order="C", write_empty_chunks=True), - prototype=default_buffer_prototype(), - ) - - -def _build_codecs( - compressor: str, - serializer: str, -) -> tuple[Codec, ...]: - """Build a codec tuple from human-readable compressor/serializer names.""" - bb: tuple[Codec, ...] 
= () - if compressor == "gzip": - bb = (GzipCodec(level=1),) - - if serializer == "sharding": - # 4 inner chunks per shard - inner_chunk = (CHUNK_ELEMENTS // 4,) - inner_codecs: list[Codec] = [BytesCodec()] - if bb: - inner_codecs.extend(bb) - return (ShardingCodec(chunk_shape=inner_chunk, codecs=inner_codecs),) - else: - return (BytesCodec(), *bb) - - -def _make_pipeline( - kind: PipelineKind, - codecs: tuple[Codec, ...], - spec: ArraySpec, -) -> BatchedCodecPipeline | PhasedCodecPipeline: - if kind == PipelineKind.batched: - pipeline = BatchedCodecPipeline.from_codecs(codecs) - # Work around generator-consumption bug in codecs_from_list - evolved_codecs = tuple(c.evolve_from_array_spec(array_spec=spec) for c in pipeline) - return BatchedCodecPipeline.from_codecs(evolved_codecs) - else: # phased_async, phased_sync, phased_sync_threaded - pipeline = PhasedCodecPipeline.from_codecs(codecs) # type: ignore[assignment] - return pipeline.evolve_from_array_spec(spec) - - -def _write_and_read( - pipeline: BatchedCodecPipeline | PhasedCodecPipeline, - store: MemoryStore, - spec: ArraySpec, - data: np.ndarray[Any, np.dtype[Any]], - kind: PipelineKind, - n_chunks: int = 1, -) -> None: - """Write data as n_chunks, then read it all back.""" - chunk_size = data.shape[0] // n_chunks - chunk_shape = (chunk_size,) - chunk_spec = _make_spec(chunk_shape, dtype=str(data.dtype)) - - # Build batch info for all chunks - write_batch: list[tuple[Any, ...]] = [] - for i in range(n_chunks): - store_path = StorePath(store, f"c/{i}") - chunk_sel = (slice(0, chunk_size),) - out_sel = (slice(i * chunk_size, (i + 1) * chunk_size),) - write_batch.append((store_path, chunk_spec, chunk_sel, out_sel, True)) - - value = CPUNDBuffer.from_numpy_array(data) - - if kind == PipelineKind.phased_sync: - assert isinstance(pipeline, PhasedCodecPipeline) - pipeline.write_sync(write_batch, value) - out = CPUNDBuffer.from_numpy_array(np.empty_like(data)) - pipeline.read_sync(write_batch, out) - elif kind == PipelineKind.phased_sync_threaded: - assert isinstance(pipeline, PhasedCodecPipeline) - pipeline.write_sync(write_batch, value, n_workers=4) - out = CPUNDBuffer.from_numpy_array(np.empty_like(data)) - pipeline.read_sync(write_batch, out, n_workers=4) - else: - sync(pipeline.write(write_batch, value)) - out = CPUNDBuffer.from_numpy_array(np.empty_like(data)) - sync(pipeline.read(write_batch, out)) - - -@pytest.mark.benchmark(group="pipeline") -@pytest.mark.parametrize( - "kind", - [ - PipelineKind.batched, - PipelineKind.phased_async, - PipelineKind.phased_sync, - PipelineKind.phased_sync_threaded, - ], - ids=["batched", "phased-async", "phased-sync", "phased-sync-threaded"], -) -@pytest.mark.parametrize("compressor", ["none", "gzip"], ids=["no-compress", "gzip"]) -@pytest.mark.parametrize("serializer", ["bytes", "sharding"], ids=["bytes", "sharding"]) -@pytest.mark.parametrize("n_chunks", [1, 8], ids=["1chunk", "8chunks"]) -def test_pipeline( - benchmark: Any, - kind: PipelineKind, - compressor: str, - serializer: str, - n_chunks: int, -) -> None: - """1 MB per chunk, parametrized over pipeline, compressor, serializer, and chunk count.""" - codecs = _build_codecs(compressor, serializer) - - # Sync paths require SupportsChunkMapping for the BytesCodec-level IO - # ShardingCodec now has _decode_sync/_encode_sync but not SupportsChunkMapping - if serializer == "sharding" and kind in ( - PipelineKind.phased_sync, - PipelineKind.phased_sync_threaded, - ): - pytest.skip("Sync IO path not yet implemented for ShardingCodec") - - # Threading 
only helps with multiple chunks - if kind == PipelineKind.phased_sync_threaded and n_chunks == 1: - pytest.skip("Threading with 1 chunk has no benefit") - - total_elements = CHUNK_ELEMENTS * n_chunks - spec = _make_spec((total_elements,)) - data = np.random.default_rng(42).random(total_elements) - store = MemoryStore() - pipeline = _make_pipeline(kind, codecs, _make_spec(CHUNK_SHAPE)) - - benchmark(_write_and_read, pipeline, store, spec, data, kind, n_chunks) From 88eac8fbe1c473b97a4ffb4b4290dc4d2e7ded1e Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 30 Apr 2026 21:29:42 -0400 Subject: [PATCH 14/22] refactor: rename SyncCodecPipeline to FusedCodecPipeline The new name describes what the pipeline does (fuses fetch+decode+scatter into one task per chunk) rather than the implementation detail of using sync codec entry points. The name also stays accurate when this pipeline gains a remote-store / async fast path in a future change. Mechanical rename across the class, register_pipeline call, dotted-path strings used by zarr.config, isinstance checks, parametrize values, and docstrings. tests/test_sync_pipeline.py renamed to tests/test_fused_pipeline.py. Nothing on this branch is released, so no deprecation alias is needed. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/zarr/core/codec_pipeline.py | 22 +++++----- tests/test_codec_invariants.py | 14 +++---- tests/test_codec_pipeline.py | 2 +- ...ync_pipeline.py => test_fused_pipeline.py} | 40 +++++++++---------- tests/test_pipeline_parity.py | 22 +++++----- 5 files changed, 48 insertions(+), 52 deletions(-) rename tests/{test_sync_pipeline.py => test_fused_pipeline.py} (93%) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index d6eebbb381..65564e3db6 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -7,8 +7,6 @@ from typing import TYPE_CHECKING, Any, cast from warnings import warn -import numpy as np - from zarr.abc.codec import ( ArrayArrayCodec, ArrayBytesCodec, @@ -20,8 +18,6 @@ GetResult, SupportsSyncCodec, ) -from zarr.core.array_spec import ArraySpec -from zarr.core.buffer import numpy_buffer_prototype from zarr.core.common import concurrent_map from zarr.core.config import config from zarr.core.indexing import SelectorTuple, is_scalar @@ -33,6 +29,7 @@ from typing import Self from zarr.abc.store import ByteGetter, ByteSetter + from zarr.core.array_spec import ArraySpec from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType from zarr.core.metadata.v3 import ChunkGridMetadata @@ -213,10 +210,9 @@ def decode_chunk(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> NDBuffer: ---------- chunk_bytes : Buffer The encoded chunk bytes. - chunk_shape : tuple[int, ...] or None - The shape of this chunk. If None, uses the shape from the - ArraySpec provided at construction. Required for rectilinear - grids where chunks have different shapes. + chunk_spec : ArraySpec + The array spec describing shape, dtype, fill value, and codec + configuration for this chunk. """ aa_specs, ab_spec = self._resolve_specs(chunk_spec) @@ -240,9 +236,9 @@ def encode_chunk(self, chunk_array: NDBuffer, chunk_spec: ArraySpec) -> Buffer | ---------- chunk_array : NDBuffer The chunk data to encode. - chunk_shape : tuple[int, ...] or None - The shape of this chunk. If None, uses the shape from the - ArraySpec provided at construction. 
+ chunk_spec : ArraySpec + The array spec describing shape, dtype, fill value, and codec + configuration for this chunk. """ aa_specs, ab_spec = self._resolve_specs(chunk_spec) @@ -785,7 +781,7 @@ def codecs_from_list( @dataclass(frozen=True) -class SyncCodecPipeline(CodecPipeline): +class FusedCodecPipeline(CodecPipeline): """Codec pipeline that uses the codec chain directly. Separates IO from compute without an intermediate layout abstraction. @@ -1284,4 +1280,4 @@ async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> Non ) -register_pipeline(SyncCodecPipeline) +register_pipeline(FusedCodecPipeline) diff --git a/tests/test_codec_invariants.py b/tests/test_codec_invariants.py index 5ddf4cfd93..c4862b2b47 100644 --- a/tests/test_codec_invariants.py +++ b/tests/test_codec_invariants.py @@ -33,7 +33,7 @@ from zarr.codecs.zstd import ZstdCodec from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.buffer import Buffer, default_buffer_prototype -from zarr.core.codec_pipeline import ChunkTransform, SyncCodecPipeline +from zarr.core.codec_pipeline import ChunkTransform, FusedCodecPipeline from zarr.core.dtype import get_data_type_from_native_dtype from zarr.storage import LocalStore, MemoryStore @@ -165,11 +165,11 @@ def test_C3_pipeline_methods_do_not_isinstance_check_sharding_codec() -> None: import inspect import re - from zarr.core.codec_pipeline import BatchedCodecPipeline, SyncCodecPipeline + from zarr.core.codec_pipeline import BatchedCodecPipeline, FusedCodecPipeline pattern = re.compile(r"isinstance\s*\([^)]*ShardingCodec[^)]*\)") - for cls in (SyncCodecPipeline, BatchedCodecPipeline): + for cls in (FusedCodecPipeline, BatchedCodecPipeline): for method_name in ("read", "write", "read_sync", "write_sync"): method = getattr(cls, method_name, None) if method is None: @@ -230,16 +230,16 @@ def test_S2_empty_shard_deleted_after_partial_writes_to_fill() -> None: def _is_sync_pipeline_default() -> bool: - """Check whether SyncCodecPipeline is the active pipeline.""" + """Check whether FusedCodecPipeline is the active pipeline.""" store = MemoryStore() arr = zarr.create_array(store=store, shape=(8,), chunks=(8,), dtype="uint8", fill_value=0) - return isinstance(arr._async_array.codec_pipeline, SyncCodecPipeline) + return isinstance(arr._async_array.codec_pipeline, FusedCodecPipeline) def test_S3_byte_range_path_skipped_when_write_empty_chunks_false() -> None: """S3: under default config, partial shard writes do not call set_range_sync.""" if not _is_sync_pipeline_default(): - pytest.skip("byte-range fast path is specific to SyncCodecPipeline") + pytest.skip("byte-range fast path is specific to FusedCodecPipeline") store = MemoryStore() arr = zarr.create_array( @@ -264,7 +264,7 @@ def test_S3_byte_range_path_skipped_when_write_empty_chunks_false() -> None: def test_S3_byte_range_path_used_when_write_empty_chunks_true() -> None: """S3: with write_empty_chunks=True, partial shard writes use set_range_sync.""" if not _is_sync_pipeline_default(): - pytest.skip("byte-range fast path is specific to SyncCodecPipeline") + pytest.skip("byte-range fast path is specific to FusedCodecPipeline") store = MemoryStore() arr = zarr.create_array( diff --git a/tests/test_codec_pipeline.py b/tests/test_codec_pipeline.py index 015a98c495..8ed58f1a23 100644 --- a/tests/test_codec_pipeline.py +++ b/tests/test_codec_pipeline.py @@ -26,7 +26,7 @@ def _enable_rectilinear_chunks() -> Generator[None]: pipeline_paths = [ "zarr.core.codec_pipeline.BatchedCodecPipeline", - 
"zarr.core.codec_pipeline.SyncCodecPipeline", + "zarr.core.codec_pipeline.FusedCodecPipeline", ] diff --git a/tests/test_sync_pipeline.py b/tests/test_fused_pipeline.py similarity index 93% rename from tests/test_sync_pipeline.py rename to tests/test_fused_pipeline.py index 1df182b9c5..4268cf4f86 100644 --- a/tests/test_sync_pipeline.py +++ b/tests/test_fused_pipeline.py @@ -1,4 +1,4 @@ -"""Tests for SyncCodecPipeline -- the sync-first codec pipeline.""" +"""Tests for FusedCodecPipeline -- the per-chunk-fused codec pipeline.""" from __future__ import annotations @@ -14,7 +14,7 @@ from zarr.codecs.transpose import TransposeCodec from zarr.codecs.zstd import ZstdCodec from zarr.core.buffer import cpu -from zarr.core.codec_pipeline import SyncCodecPipeline +from zarr.core.codec_pipeline import FusedCodecPipeline from zarr.storage import MemoryStore, StorePath @@ -25,11 +25,11 @@ def _create_array( codecs: tuple[Any, ...] = (BytesCodec(),), fill_value: object = 0, ) -> zarr.Array[Any]: - """Create a zarr array using SyncCodecPipeline.""" + """Create a zarr array using FusedCodecPipeline.""" if chunks is None: chunks = shape - _ = SyncCodecPipeline.from_codecs(codecs) + _ = FusedCodecPipeline.from_codecs(codecs) return zarr.create_array( StorePath(MemoryStore()), @@ -55,8 +55,8 @@ def _create_array( ids=["bytes-only", "gzip", "zstd", "transpose", "transpose+zstd"], ) def test_construction(codecs: tuple[Any, ...]) -> None: - """SyncCodecPipeline can be constructed from valid codec combinations.""" - pipeline = SyncCodecPipeline.from_codecs(codecs) + """FusedCodecPipeline can be constructed from valid codec combinations.""" + pipeline = FusedCodecPipeline.from_codecs(codecs) assert pipeline.codecs == codecs @@ -66,7 +66,7 @@ def test_evolve_from_array_spec() -> None: from zarr.core.buffer import default_buffer_prototype from zarr.core.dtype import get_data_type_from_native_dtype - pipeline = SyncCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = FusedCodecPipeline.from_codecs((BytesCodec(),)) assert pipeline._sync_transform is None zdtype = get_data_type_from_native_dtype(np.dtype("float64")) @@ -92,7 +92,7 @@ def test_evolve_from_array_spec() -> None: ids=["f64-1d", "f32-1d", "i32-1d", "f64-2d"], ) def test_read_write_roundtrip(dtype: str, shape: tuple[int, ...]) -> None: - """Data written through SyncCodecPipeline can be read back correctly via async path.""" + """Data written through FusedCodecPipeline can be read back correctly via async path.""" from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.buffer import default_buffer_prototype from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer @@ -109,7 +109,7 @@ def test_read_write_roundtrip(dtype: str, shape: tuple[int, ...]) -> None: prototype=default_buffer_prototype(), ) - pipeline = SyncCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = FusedCodecPipeline.from_codecs((BytesCodec(),)) pipeline = pipeline.evolve_from_array_spec(spec) # Write @@ -156,7 +156,7 @@ def test_read_missing_chunk_fills() -> None: prototype=default_buffer_prototype(), ) - pipeline = SyncCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = FusedCodecPipeline.from_codecs((BytesCodec(),)) pipeline = pipeline.evolve_from_array_spec(spec) out = CPUNDBuffer.from_numpy_array(np.zeros(10, dtype="float64")) @@ -205,7 +205,7 @@ def test_read_write_sync_roundtrip(dtype: str, shape: tuple[int, ...]) -> None: prototype=default_buffer_prototype(), ) - pipeline = SyncCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = 
FusedCodecPipeline.from_codecs((BytesCodec(),)) pipeline = pipeline.evolve_from_array_spec(spec) data = np.arange(int(np.prod(shape)), dtype=dtype).reshape(shape) @@ -247,7 +247,7 @@ def test_read_sync_missing_chunk_fills() -> None: prototype=default_buffer_prototype(), ) - pipeline = SyncCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = FusedCodecPipeline.from_codecs((BytesCodec(),)) pipeline = pipeline.evolve_from_array_spec(spec) out = CPUNDBuffer.from_numpy_array(np.zeros(10, dtype="float64")) @@ -280,7 +280,7 @@ def test_sync_write_async_read_roundtrip() -> None: prototype=default_buffer_prototype(), ) - pipeline = SyncCodecPipeline.from_codecs((BytesCodec(),)) + pipeline = FusedCodecPipeline.from_codecs((BytesCodec(),)) pipeline = pipeline.evolve_from_array_spec(spec) data = np.arange(100, dtype="float64") @@ -311,7 +311,7 @@ def test_sync_transform_encode_decode_roundtrip() -> None: from zarr.core.dtype import Float64 codecs = (BytesCodec(),) - pipeline = SyncCodecPipeline.from_codecs(codecs) + pipeline = FusedCodecPipeline.from_codecs(codecs) zdtype = Float64() spec = ArraySpec( shape=(100,), @@ -517,7 +517,7 @@ def test_partial_shard_write_roundtrip_correctness() -> None: def test_partial_shard_write_uses_set_range() -> None: """Partial shard writes with fixed-size codecs should use set_range_sync. - Only the SyncCodecPipeline uses byte-range writes for partial shard + Only the FusedCodecPipeline uses byte-range writes for partial shard updates; skipped under other pipelines. """ from unittest.mock import patch @@ -536,8 +536,8 @@ def test_partial_shard_write_uses_set_range() -> None: fill_value=0.0, config={"write_empty_chunks": True}, ) - if not isinstance(arr._async_array.codec_pipeline, SyncCodecPipeline): - pytest.skip("byte-range write optimization is specific to SyncCodecPipeline") + if not isinstance(arr._async_array.codec_pipeline, FusedCodecPipeline): + pytest.skip("byte-range write optimization is specific to FusedCodecPipeline") # Initial full write to create the shard blob arr[:] = np.arange(100, dtype="float64") @@ -560,7 +560,7 @@ def test_partial_shard_write_uses_set_range() -> None: def test_partial_shard_write_falls_back_for_compressed() -> None: """Partial shard writes with compressed inner codecs should NOT use set_range. - Only meaningful under SyncCodecPipeline (which can use byte-range writes + Only meaningful under FusedCodecPipeline (which can use byte-range writes for fixed-size inner codecs). Other pipelines never use set_range_sync, so the assertion is trivially true and the test is uninformative. """ @@ -576,8 +576,8 @@ def test_partial_shard_write_falls_back_for_compressed() -> None: compressors=GzipCodec(), fill_value=0.0, ) - if not isinstance(arr._async_array.codec_pipeline, SyncCodecPipeline): - pytest.skip("byte-range write optimization is specific to SyncCodecPipeline") + if not isinstance(arr._async_array.codec_pipeline, FusedCodecPipeline): + pytest.skip("byte-range write optimization is specific to FusedCodecPipeline") arr[:] = np.arange(100, dtype="float64") with patch.object(type(store), "set_range_sync", wraps=store.set_range_sync) as mock_set_range: diff --git a/tests/test_pipeline_parity.py b/tests/test_pipeline_parity.py index 3352966d8a..14dbcc4495 100644 --- a/tests/test_pipeline_parity.py +++ b/tests/test_pipeline_parity.py @@ -1,7 +1,7 @@ """Pipeline parity test — exhaustive matrix of read/write scenarios. 
For every cell of the matrix (codec config x layout x operation -sequence x runtime config), assert that ``SyncCodecPipeline`` and +sequence x runtime config), assert that ``FusedCodecPipeline`` and ``BatchedCodecPipeline`` produce semantically identical results: * Same returned array contents on read. @@ -240,7 +240,7 @@ def _read_under_pipeline(pipeline_path: str, store: MemoryStore) -> Any: _BATCHED = "zarr.core.codec_pipeline.BatchedCodecPipeline" -_SYNC = "zarr.core.codec_pipeline.SyncCodecPipeline" +_FUSED = "zarr.core.codec_pipeline.FusedCodecPipeline" @pytest.mark.parametrize( @@ -253,7 +253,7 @@ def test_pipeline_parity( sequence_fn: Callable[[tuple[int, ...]], list[WriteOp]], write_empty_chunks: bool, ) -> None: - """SyncCodecPipeline must be semantically identical to BatchedCodecPipeline. + """FusedCodecPipeline must be semantically identical to BatchedCodecPipeline. Three checks, in order of decreasing diagnostic value: @@ -274,14 +274,14 @@ def test_pipeline_parity( _BATCHED, codec_kwargs, layout, sequence, write_empty_chunks ) sync_store, sync_arr = _write_under_pipeline( - _SYNC, codec_kwargs, layout, sequence, write_empty_chunks + _FUSED, codec_kwargs, layout, sequence, write_empty_chunks ) # 1. Array contents must agree. np.testing.assert_array_equal( sync_arr, batched_arr, - err_msg="SyncCodecPipeline returned different array contents than BatchedCodecPipeline", + err_msg="FusedCodecPipeline returned different array contents than BatchedCodecPipeline", ) # 2. Store key sets must agree. @@ -294,17 +294,17 @@ def test_pipeline_parity( ) # 3. Cross-read: each pipeline must correctly read the other's output. - sync_reads_batched = _read_under_pipeline(_SYNC, batched_store) + sync_reads_batched = _read_under_pipeline(_FUSED, batched_store) batched_reads_sync = _read_under_pipeline(_BATCHED, sync_store) np.testing.assert_array_equal( sync_reads_batched, batched_arr, - err_msg="SyncCodecPipeline could not correctly read BatchedCodecPipeline's output", + err_msg="FusedCodecPipeline could not correctly read BatchedCodecPipeline's output", ) np.testing.assert_array_equal( batched_reads_sync, sync_arr, - err_msg="BatchedCodecPipeline could not correctly read SyncCodecPipeline's output", + err_msg="BatchedCodecPipeline could not correctly read FusedCodecPipeline's output", ) @@ -356,7 +356,7 @@ def test_pipeline_read_parity( layout: LayoutConfig, selection: Any, ) -> None: - """Partial reads via SyncCodecPipeline must match BatchedCodecPipeline. + """Partial reads via FusedCodecPipeline must match BatchedCodecPipeline. The full-write/full-read parity test above doesn't exercise partial reads (e.g. 
a single element from a sharded array), which take a @@ -372,14 +372,14 @@ def test_pipeline_read_parity( with zarr_config.set({"codec_pipeline.path": _BATCHED}): batched_arr = zarr.open_array(store=store, mode="r")[selection] - with zarr_config.set({"codec_pipeline.path": _SYNC}): + with zarr_config.set({"codec_pipeline.path": _FUSED}): sync_arr = zarr.open_array(store=store, mode="r")[selection] np.testing.assert_array_equal( sync_arr, batched_arr, err_msg=( - f"SyncCodecPipeline read returned different result than BatchedCodecPipeline " + f"FusedCodecPipeline read returned different result than BatchedCodecPipeline " f"for selection {selection!r}" ), ) From c3d11d026a1f9cb07bdd7793aa574d6aca9e11f8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 30 Apr 2026 21:38:46 -0400 Subject: [PATCH 15/22] refactor: lift _merge_chunk_array to module level The BatchedCodecPipeline and FusedCodecPipeline classes had identical copies of _merge_chunk_array (one method, one staticmethod). Extract once as a module-level free function and call from both. No new base class or mixin is introduced. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/zarr/core/codec_pipeline.py | 139 ++++++++++++-------------------- 1 file changed, 51 insertions(+), 88 deletions(-) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 65564e3db6..1b9b8bf62b 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -127,6 +127,54 @@ def fill_value_or_default(chunk_spec: ArraySpec) -> Any: return fill_value +def _merge_chunk_array( + existing_chunk_array: NDBuffer | None, + value: NDBuffer, + out_selection: SelectorTuple, + chunk_spec: ArraySpec, + chunk_selection: SelectorTuple, + is_complete_chunk: bool, + drop_axes: tuple[int, ...], +) -> NDBuffer: + """Merge ``value`` into a full-chunk-shaped NDBuffer at ``chunk_selection``. + + If ``is_complete_chunk`` and ``value`` already covers the full chunk + shape, ``value`` is returned directly (no copy). Otherwise, a writable + buffer is materialized — either from ``existing_chunk_array.copy()`` if + one was read from the store, or freshly allocated and filled with the + chunk's fill value — and the relevant slice of ``value`` is written into it. + """ + if ( + is_complete_chunk + and value.shape == chunk_spec.shape + # Guard that this is not a partial chunk at the end with is_complete_chunk=True + and value[out_selection].shape == chunk_spec.shape + ): + return value + if existing_chunk_array is None: + chunk_array = chunk_spec.prototype.nd_buffer.create( + shape=chunk_spec.shape, + dtype=chunk_spec.dtype.to_native_dtype(), + order=chunk_spec.order, + fill_value=fill_value_or_default(chunk_spec), + ) + else: + chunk_array = existing_chunk_array.copy() + if chunk_selection == () or is_scalar( + value.as_ndarray_like(), chunk_spec.dtype.to_native_dtype() + ): + chunk_value = value + else: + chunk_value = value[out_selection] + if drop_axes: + item = tuple( + None if idx in drop_axes else slice(None) for idx in range(chunk_spec.ndim) + ) + chunk_value = chunk_value[item] + chunk_array[chunk_selection] = chunk_value + return chunk_array + + @dataclass(slots=True, kw_only=True) class ChunkTransform: """A synchronous codec chain. 
@@ -505,50 +553,6 @@ async def read_batch( results.append(GetResult(status="missing")) return tuple(results) - def _merge_chunk_array( - self, - existing_chunk_array: NDBuffer | None, - value: NDBuffer, - out_selection: SelectorTuple, - chunk_spec: ArraySpec, - chunk_selection: SelectorTuple, - is_complete_chunk: bool, - drop_axes: tuple[int, ...], - ) -> NDBuffer: - if ( - is_complete_chunk - and value.shape == chunk_spec.shape - # Guard that this is not a partial chunk at the end with is_complete_chunk=True - and value[out_selection].shape == chunk_spec.shape - ): - return value - if existing_chunk_array is None: - chunk_array = chunk_spec.prototype.nd_buffer.create( - shape=chunk_spec.shape, - dtype=chunk_spec.dtype.to_native_dtype(), - order=chunk_spec.order, - fill_value=fill_value_or_default(chunk_spec), - ) - else: - chunk_array = existing_chunk_array.copy() # make a writable copy - if chunk_selection == () or is_scalar( - value.as_ndarray_like(), chunk_spec.dtype.to_native_dtype() - ): - chunk_value = value - else: - chunk_value = value[out_selection] - # handle missing singleton dimensions - if drop_axes: - item = tuple( - None # equivalent to np.newaxis - if idx in drop_axes - else slice(None) - for idx in range(chunk_spec.ndim) - ) - chunk_value = chunk_value[item] - chunk_array[chunk_selection] = chunk_value - return chunk_array - async def write_batch( self, batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], @@ -603,7 +607,7 @@ async def _read_key( ) chunk_array_merged = [ - self._merge_chunk_array( + _merge_chunk_array( chunk_array, value, out_selection, @@ -907,47 +911,6 @@ async def encode( ) return chunk_bytes_batch - # -- merge helper -- - - @staticmethod - def _merge_chunk_array( - existing_chunk_array: NDBuffer | None, - value: NDBuffer, - out_selection: SelectorTuple, - chunk_spec: ArraySpec, - chunk_selection: SelectorTuple, - is_complete_chunk: bool, - drop_axes: tuple[int, ...], - ) -> NDBuffer: - if ( - is_complete_chunk - and value.shape == chunk_spec.shape - and value[out_selection].shape == chunk_spec.shape - ): - return value - if existing_chunk_array is None: - chunk_array = chunk_spec.prototype.nd_buffer.create( - shape=chunk_spec.shape, - dtype=chunk_spec.dtype.to_native_dtype(), - order=chunk_spec.order, - fill_value=fill_value_or_default(chunk_spec), - ) - else: - chunk_array = existing_chunk_array.copy() - if chunk_selection == () or is_scalar( - value.as_ndarray_like(), chunk_spec.dtype.to_native_dtype() - ): - chunk_value = value - else: - chunk_value = value[out_selection] - if drop_axes: - item = tuple( - None if idx in drop_axes else slice(None) for idx in range(chunk_spec.ndim) - ) - chunk_value = chunk_value[item] - chunk_array[chunk_selection] = chunk_value - return chunk_array - # -- sync read/write -- def read_sync( @@ -1097,7 +1060,7 @@ def _write_one( if existing_bytes is not None: existing_chunk_array = transform.decode_chunk(existing_bytes, chunk_spec) - chunk_array = self._merge_chunk_array( + chunk_array = _merge_chunk_array( existing_chunk_array, value, out_selection, @@ -1228,7 +1191,7 @@ async def _read_key( ) chunk_array_merged = [ - self._merge_chunk_array( + _merge_chunk_array( chunk_array, value, out_selection, From 71f0d32d585ba4c0e900c6d89f057acb11c1d4fa Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 30 Apr 2026 21:43:00 -0400 Subject: [PATCH 16/22] refactor: extract _async_read_fallback to module level Both BatchedCodecPipeline.read_batch (non-partial-decode branch) and 
FusedCodecPipeline.read (async fallback) duplicate the same sequence: concurrent_map(get) -> pipeline.decode -> scatter into out. Lift to a module-level free function and call from both. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/zarr/core/codec_pipeline.py | 105 ++++++++++++++------------------ 1 file changed, 47 insertions(+), 58 deletions(-) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 1b9b8bf62b..42e9fe23e3 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -175,6 +175,49 @@ def _merge_chunk_array( return chunk_array +async def _async_read_fallback( + pipeline: CodecPipeline, + batch: list[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + out: NDBuffer, + drop_axes: tuple[int, ...], +) -> tuple[GetResult, ...]: + """Async fallback read used when no fast-path is available. + + Fetches every chunk's bytes via ``concurrent_map`` (sized by + ``async.concurrency``), decodes the batch through ``pipeline.decode``, + then scatters each decoded chunk into ``out`` at its ``out_selection``. + + Used by both ``BatchedCodecPipeline.read_batch`` (non-partial-decode + branch) and ``FusedCodecPipeline.read`` (when the store is not a + ``SupportsGetSync`` / sync transform is unavailable). + """ + chunk_bytes_batch = await concurrent_map( + [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch], + lambda byte_getter, prototype: byte_getter.get(prototype), + config.get("async.concurrency"), + ) + chunk_array_batch = await pipeline.decode( + [ + (chunk_bytes, chunk_spec) + for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch, strict=False) + ], + ) + results: list[GetResult] = [] + for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip( + chunk_array_batch, batch, strict=False + ): + if chunk_array is not None: + tmp = chunk_array[chunk_selection] + if drop_axes: + tmp = tmp.squeeze(axis=drop_axes) + out[out_selection] = tmp + results.append(GetResult(status="present")) + else: + out[out_selection] = fill_value_or_default(chunk_spec) + results.append(GetResult(status="missing")) + return tuple(results) + + @dataclass(slots=True, kw_only=True) class ChunkTransform: """A synchronous codec chain. @@ -501,8 +544,8 @@ async def read_batch( out: NDBuffer, drop_axes: tuple[int, ...] 
= (), ) -> tuple[GetResult, ...]: - results: list[GetResult] = [] if self.supports_partial_decode: + results: list[GetResult] = [] batch_info_list = list(batch_info) chunk_array_batch = await self.decode_partial_batch( [ @@ -521,37 +564,8 @@ async def read_batch( else: out[out_selection] = fill_value_or_default(chunk_spec) results.append(GetResult(status="missing")) - else: - batch_info_list = list(batch_info) - chunk_bytes_batch = await concurrent_map( - [ - (byte_getter, array_spec.prototype) - for byte_getter, array_spec, *_ in batch_info_list - ], - lambda byte_getter, prototype: byte_getter.get(prototype), - config.get("async.concurrency"), - ) - chunk_array_batch = await self.decode_batch( - [ - (chunk_bytes, chunk_spec) - for chunk_bytes, (_, chunk_spec, *_) in zip( - chunk_bytes_batch, batch_info_list, strict=False - ) - ], - ) - for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip( - chunk_array_batch, batch_info_list, strict=False - ): - if chunk_array is not None: - tmp = chunk_array[chunk_selection] - if drop_axes: - tmp = tmp.squeeze(axis=drop_axes) - out[out_selection] = tmp - results.append(GetResult(status="present")) - else: - out[out_selection] = fill_value_or_default(chunk_spec) - results.append(GetResult(status="missing")) - return tuple(results) + return tuple(results) + return await _async_read_fallback(self, list(batch_info), out, drop_axes) async def write_batch( self, @@ -1113,32 +1127,7 @@ async def read( ): return self.read_sync(batch, out, drop_axes, max_workers=_resolve_max_workers()) - # Async fallback: fetch all chunks, decode via async codec API, scatter - chunk_bytes_batch = await concurrent_map( - [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch], - lambda byte_getter, prototype: byte_getter.get(prototype), - config.get("async.concurrency"), - ) - chunk_array_batch = await self.decode( - [ - (chunk_bytes, chunk_spec) - for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch, strict=False) - ], - ) - results: list[GetResult] = [] - for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip( - chunk_array_batch, batch, strict=False - ): - if chunk_array is not None: - tmp = chunk_array[chunk_selection] - if drop_axes: - tmp = tmp.squeeze(axis=drop_axes) - out[out_selection] = tmp - results.append(GetResult(status="present")) - else: - out[out_selection] = fill_value_or_default(chunk_spec) - results.append(GetResult(status="missing")) - return tuple(results) + return await _async_read_fallback(self, batch, out, drop_axes) async def write( self, From f463035ca0bef71553876fbaf563d4cc3aca6339 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 30 Apr 2026 21:47:59 -0400 Subject: [PATCH 17/22] refactor: extract _async_write_fallback to module level Both BatchedCodecPipeline.write_batch (non-partial-encode branch) and FusedCodecPipeline.write (async fallback) duplicate the same sequence: read existing bytes -> decode -> merge -> encode -> set/delete. Lift to a module-level free function and call from both. After this change, neither pipeline class carries _merge_chunk_array, nor the duplicated read/write fallback bodies. Each class is reduced to its constructor, fast-path methods, and thin async dispatch. 
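
For illustration, after this change each async entry point reduces to a guard for the
sync fast path followed by delegation to the shared module-level helper. A simplified
sketch (the guard name below is a stand-in; the real condition checks store
capabilities and sync-transform availability, as the diff shows):

    async def write(self, batch_info, value, drop_axes=()):
        batch = list(batch_info)
        # Fast path: run the fused synchronous implementation when possible.
        if self._sync_fast_path_available(batch):   # stand-in for the real checks
            self.write_sync(batch, value, drop_axes, max_workers=_resolve_max_workers())
            return
        # Otherwise use the shared read -> decode -> merge -> encode -> write helper.
        await _async_write_fallback(self, batch, value, drop_axes)
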
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/zarr/core/codec_pipeline.py | 268 ++++++++++++-------------------- 1 file changed, 103 insertions(+), 165 deletions(-) diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 42e9fe23e3..49f3320c15 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -218,6 +218,106 @@ async def _async_read_fallback( return tuple(results) +async def _async_write_fallback( + pipeline: CodecPipeline, + batch: list[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]], + value: NDBuffer, + drop_axes: tuple[int, ...], +) -> None: + """Async fallback write used when no fast-path is available. + + For each chunk in ``batch``: read its existing bytes from the store + (skipping the read for complete chunks), decode the batch via + ``pipeline.decode``, merge ``value`` into each decoded chunk via + ``_merge_chunk_array``, drop chunks that are all-fill when + ``write_empty_chunks`` is False, encode the surviving chunks via + ``pipeline.encode``, then ``set`` the encoded bytes (or ``delete`` + if encoding produced ``None`` or the chunk dropped). + + Used by both ``BatchedCodecPipeline.write_batch`` (non-partial-encode + branch) and ``FusedCodecPipeline.write`` (when the store is not a + ``SupportsSetSync`` / sync transform is unavailable). + """ + + async def _read_key( + byte_setter: ByteSetter | None, prototype: BufferPrototype + ) -> Buffer | None: + if byte_setter is None: + return None + return await byte_setter.get(prototype=prototype) + + chunk_bytes_batch: Iterable[Buffer | None] + chunk_bytes_batch = await concurrent_map( + [ + ( + None if is_complete_chunk else byte_setter, + chunk_spec.prototype, + ) + for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch + ], + _read_key, + config.get("async.concurrency"), + ) + chunk_array_decoded = await pipeline.decode( + [ + (chunk_bytes, chunk_spec) + for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch, strict=False) + ], + ) + + chunk_array_merged = [ + _merge_chunk_array( + chunk_array, + value, + out_selection, + chunk_spec, + chunk_selection, + is_complete_chunk, + drop_axes, + ) + for chunk_array, ( + _, + chunk_spec, + chunk_selection, + out_selection, + is_complete_chunk, + ) in zip(chunk_array_decoded, batch, strict=False) + ] + chunk_array_batch: list[NDBuffer | None] = [] + for chunk_array, (_, chunk_spec, *_) in zip(chunk_array_merged, batch, strict=False): + if chunk_array is None: + chunk_array_batch.append(None) # type: ignore[unreachable] + else: + if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal( + fill_value_or_default(chunk_spec) + ): + chunk_array_batch.append(None) + else: + chunk_array_batch.append(chunk_array) + + chunk_bytes_batch = await pipeline.encode( + [ + (chunk_array, chunk_spec) + for chunk_array, (_, chunk_spec, *_) in zip(chunk_array_batch, batch, strict=False) + ], + ) + + async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> None: + if chunk_bytes is None: + await byte_setter.delete() + else: + await byte_setter.set(chunk_bytes) + + await concurrent_map( + [ + (byte_setter, chunk_bytes) + for chunk_bytes, (byte_setter, *_) in zip(chunk_bytes_batch, batch, strict=False) + ], + _write_key, + config.get("async.concurrency"), + ) + + @dataclass(slots=True, kw_only=True) class ChunkTransform: """A synchronous codec chain. 
@@ -590,93 +690,8 @@ async def write_batch( ], ) - else: - # Read existing bytes if not total slice - async def _read_key( - byte_setter: ByteSetter | None, prototype: BufferPrototype - ) -> Buffer | None: - if byte_setter is None: - return None - return await byte_setter.get(prototype=prototype) - - chunk_bytes_batch: Iterable[Buffer | None] - chunk_bytes_batch = await concurrent_map( - [ - ( - None if is_complete_chunk else byte_setter, - chunk_spec.prototype, - ) - for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch_info - ], - _read_key, - config.get("async.concurrency"), - ) - chunk_array_decoded = await self.decode_batch( - [ - (chunk_bytes, chunk_spec) - for chunk_bytes, (_, chunk_spec, *_) in zip( - chunk_bytes_batch, batch_info, strict=False - ) - ], - ) - - chunk_array_merged = [ - _merge_chunk_array( - chunk_array, - value, - out_selection, - chunk_spec, - chunk_selection, - is_complete_chunk, - drop_axes, - ) - for chunk_array, ( - _, - chunk_spec, - chunk_selection, - out_selection, - is_complete_chunk, - ) in zip(chunk_array_decoded, batch_info, strict=False) - ] - chunk_array_batch: list[NDBuffer | None] = [] - for chunk_array, (_, chunk_spec, *_) in zip( - chunk_array_merged, batch_info, strict=False - ): - if chunk_array is None: - chunk_array_batch.append(None) # type: ignore[unreachable] - else: - if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal( - fill_value_or_default(chunk_spec) - ): - chunk_array_batch.append(None) - else: - chunk_array_batch.append(chunk_array) - - chunk_bytes_batch = await self.encode_batch( - [ - (chunk_array, chunk_spec) - for chunk_array, (_, chunk_spec, *_) in zip( - chunk_array_batch, batch_info, strict=False - ) - ], - ) - - async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> None: - if chunk_bytes is None: - await byte_setter.delete() - else: - await byte_setter.set(chunk_bytes) - - await concurrent_map( - [ - (byte_setter, chunk_bytes) - for chunk_bytes, (byte_setter, *_) in zip( - chunk_bytes_batch, batch_info, strict=False - ) - ], - _write_key, - config.get("async.concurrency"), - ) + return + await _async_write_fallback(self, list(batch_info), value, drop_axes) async def decode( self, @@ -1152,84 +1167,7 @@ async def write( self.write_sync(batch, value, drop_axes, max_workers=_resolve_max_workers()) return - # Async fallback: same pattern as BatchedCodecPipeline.write_batch - async def _read_key( - byte_setter: ByteSetter | None, prototype: BufferPrototype - ) -> Buffer | None: - if byte_setter is None: - return None - return await byte_setter.get(prototype=prototype) - - chunk_bytes_batch: Iterable[Buffer | None] - chunk_bytes_batch = await concurrent_map( - [ - ( - None if is_complete_chunk else byte_setter, - chunk_spec.prototype, - ) - for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch - ], - _read_key, - config.get("async.concurrency"), - ) - chunk_array_decoded = await self.decode( - [ - (chunk_bytes, chunk_spec) - for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch, strict=False) - ], - ) - - chunk_array_merged = [ - _merge_chunk_array( - chunk_array, - value, - out_selection, - chunk_spec, - chunk_selection, - is_complete_chunk, - drop_axes, - ) - for chunk_array, ( - _, - chunk_spec, - chunk_selection, - out_selection, - is_complete_chunk, - ) in zip(chunk_array_decoded, batch, strict=False) - ] - chunk_array_batch: list[NDBuffer | None] = [] - for chunk_array, (_, chunk_spec, *_) in zip(chunk_array_merged, batch, 
strict=False): - if chunk_array is None: - chunk_array_batch.append(None) # type: ignore[unreachable] - else: - if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal( - fill_value_or_default(chunk_spec) - ): - chunk_array_batch.append(None) - else: - chunk_array_batch.append(chunk_array) - - chunk_bytes_batch = await self.encode( - [ - (chunk_array, chunk_spec) - for chunk_array, (_, chunk_spec, *_) in zip(chunk_array_batch, batch, strict=False) - ], - ) - - async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> None: - if chunk_bytes is None: - await byte_setter.delete() - else: - await byte_setter.set(chunk_bytes) - - await concurrent_map( - [ - (byte_setter, chunk_bytes) - for chunk_bytes, (byte_setter, *_) in zip(chunk_bytes_batch, batch, strict=False) - ], - _write_key, - config.get("async.concurrency"), - ) + await _async_write_fallback(self, batch, value, drop_axes) register_pipeline(FusedCodecPipeline) From 621361a02a13b3aaae8f9568d807a11c5087c094 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Thu, 30 Apr 2026 22:24:20 -0400 Subject: [PATCH 18/22] test(bench): parametrize test_e2e over both codec pipelines Adds a `pipeline` fixture with values ["batched", "fused"] that swaps codec_pipeline.path for the duration of each benchmark. Both test_write_array and test_read_array now produce one benchmark cell per (compression x layout x store x pipeline). CodSpeed will report comparable numbers for both pipelines on the same workloads. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/benchmarks/test_e2e.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/tests/benchmarks/test_e2e.py b/tests/benchmarks/test_e2e.py index 65d0e65ac9..46f59c73ed 100644 --- a/tests/benchmarks/test_e2e.py +++ b/tests/benchmarks/test_e2e.py @@ -9,6 +9,8 @@ from tests.benchmarks.common import Layout if TYPE_CHECKING: + from collections.abc import Iterator + from pytest_benchmark.fixture import BenchmarkFixture from zarr.abc.store import Store @@ -19,6 +21,7 @@ import pytest from zarr import create_array +from zarr.core.config import config as zarr_config CompressorName = Literal["gzip"] | None @@ -37,12 +40,33 @@ Layout(shape=(1_000_000,), chunks=(100,), shards=(10000 * 100,)), ) +_PIPELINE_PATHS = { + "batched": "zarr.core.codec_pipeline.BatchedCodecPipeline", + "fused": "zarr.core.codec_pipeline.FusedCodecPipeline", +} + + +@pytest.fixture(params=["batched", "fused"]) +def pipeline(request: pytest.FixtureRequest) -> Iterator[str]: + """Set ``codec_pipeline.path`` for the duration of the benchmark. + + Yields the pipeline name so each parametrize cell has a distinct + benchmark id. 
+ """ + name = request.param + with zarr_config.set({"codec_pipeline.path": _PIPELINE_PATHS[name]}): + yield name + @pytest.mark.parametrize("compression_name", [None, "gzip"]) @pytest.mark.parametrize("layout", layouts, ids=str) @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) def test_write_array( - store: Store, layout: Layout, compression_name: CompressorName, benchmark: BenchmarkFixture + store: Store, + layout: Layout, + compression_name: CompressorName, + pipeline: str, + benchmark: BenchmarkFixture, ) -> None: """ Test the time required to fill an array with a single value @@ -64,7 +88,11 @@ def test_write_array( @pytest.mark.parametrize("layout", layouts, ids=str) @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) def test_read_array( - store: Store, layout: Layout, compression_name: CompressorName, benchmark: BenchmarkFixture + store: Store, + layout: Layout, + compression_name: CompressorName, + pipeline: str, + benchmark: BenchmarkFixture, ) -> None: """ Test the time required to fill an array with a single value From fb812c40421e350e0a402ac9fd42a4056aac562f Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 1 May 2026 06:46:23 -0400 Subject: [PATCH 19/22] test(bench): parametrize test_e2e over a synthetic latency dimension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `latency in {0, 0.001, 0.05, 0.2}` and a bench_store fixture that wraps the underlying memory store in zarr.testing.store.LatencyStore when latency > 0. Local-store cells skip nonzero latency — adding synthetic latency on top of a real filesystem double-counts and is not the intended measurement. Combined with the pipeline parameter, the matrix now produces comparable benchmark numbers for {Batched, Fused} x {0, 1ms, 50ms, 200ms} on memory-shaped operation. The numbers are signal under one simple model of remote latency, not absolute predictions of S3 behavior. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/benchmarks/test_e2e.py | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/tests/benchmarks/test_e2e.py b/tests/benchmarks/test_e2e.py index 46f59c73ed..0e5e88b72c 100644 --- a/tests/benchmarks/test_e2e.py +++ b/tests/benchmarks/test_e2e.py @@ -15,6 +15,7 @@ from zarr.abc.store import Store from zarr.core.common import NamedConfig + from operator import getitem, setitem from typing import Any, Literal @@ -22,6 +23,7 @@ from zarr import create_array from zarr.core.config import config as zarr_config +from zarr.testing.store import LatencyStore CompressorName = Literal["gzip"] | None @@ -45,6 +47,30 @@ "fused": "zarr.core.codec_pipeline.FusedCodecPipeline", } +_LATENCY_VALUES = (0.0, 0.001, 0.05, 0.2) + + +@pytest.fixture(params=_LATENCY_VALUES, ids=lambda v: f"latency={v}") +def latency(request: pytest.FixtureRequest) -> float: + return request.param # type: ignore[no-any-return] + + +@pytest.fixture +def bench_store(store: Store, latency: float, request: pytest.FixtureRequest) -> Store: + """Wraps the underlying store in LatencyStore when latency > 0. + + Local-store cases skip nonzero latency — synthetic latency on top of + a real LocalStore is double-counting; latency simulation only applies + to the in-process memory store. 
+ """ + callspec = getattr(request.node, "callspec", None) + store_kind = callspec.params.get("store", "memory") if callspec is not None else "memory" + if latency > 0: + if store_kind == "local": + pytest.skip("latency injection only applies to in-memory store") + return LatencyStore(store, get_latency=latency, set_latency=latency) + return store + @pytest.fixture(params=["batched", "fused"]) def pipeline(request: pytest.FixtureRequest) -> Iterator[str]: @@ -62,7 +88,7 @@ def pipeline(request: pytest.FixtureRequest) -> Iterator[str]: @pytest.mark.parametrize("layout", layouts, ids=str) @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) def test_write_array( - store: Store, + bench_store: Store, layout: Layout, compression_name: CompressorName, pipeline: str, @@ -72,7 +98,7 @@ def test_write_array( Test the time required to fill an array with a single value """ arr = create_array( - store, + bench_store, dtype="uint8", shape=layout.shape, chunks=layout.chunks, @@ -88,7 +114,7 @@ def test_write_array( @pytest.mark.parametrize("layout", layouts, ids=str) @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"]) def test_read_array( - store: Store, + bench_store: Store, layout: Layout, compression_name: CompressorName, pipeline: str, @@ -98,7 +124,7 @@ def test_read_array( Test the time required to fill an array with a single value """ arr = create_array( - store, + bench_store, dtype="uint8", shape=layout.shape, chunks=layout.chunks, From 241282a0bc8b77938272c6647e5c1306ffd6f8c4 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 1 May 2026 09:45:32 -0400 Subject: [PATCH 20/22] chore: restore deleted comments in V2Codec._decode_sync Commit 7f45aba9 (which converted _decode_single -> _decode_sync) dropped two explanatory comment blocks from the dtype-handling branches in V2Codec.decode. Both comments document non-obvious WHY: - The TypeError catch is for chunks whose stored dtype doesn't match the array spec dtype (e.g. string dtype vs object array). - The elif branch fires when filters were tampered with: an object array needs an object codec in the filter chain to be read correctly. These were removed as drive-by cleanup during the sync-method rename without intent to delete the substance. Restoring verbatim. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/zarr/codecs/_v2.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py index bb34e31b8a..7fdf408d1d 100644 --- a/src/zarr/codecs/_v2.py +++ b/src/zarr/codecs/_v2.py @@ -48,9 +48,20 @@ def _decode_sync( try: chunk = chunk.view(chunk_spec.dtype.to_native_dtype()) except TypeError: + # this will happen if the dtype of the chunk + # does not match the dtype of the array spec i.g. if + # the dtype of the chunk_spec is a string dtype, but the chunk + # is an object array. In this case, we need to convert the object + # array to the correct dtype. + chunk = np.array(chunk).astype(chunk_spec.dtype.to_native_dtype()) elif chunk.dtype != object: + # If we end up here, someone must have hacked around with the filters. + # We cannot deal with object arrays unless there is an object + # codec in the filter chain, i.e., a filter that converts from object + # array to something else during encoding, and converts back to object + # array during decoding. 
raise RuntimeError("cannot read object array without object codec") # ensure correct chunk shape From 0324a852d7b234bfd39e3ec6e4f2768ab842ecc8 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 1 May 2026 09:48:12 -0400 Subject: [PATCH 21/22] docs: explain non-obvious behaviors in sharding sync methods Add docstring substance and a couple of inline notes to the new sync methods on ShardingCodec that landed on this branch. Concretely: - _decode_sync / _encode_sync: explain how each relates to the async counterpart and the partial-* variants, and why inner chunks are iterated in Morton order on the encode path. - _encode_shard_dict_sync: explain the two-pass offset shift in the index-at-start branch (offsets are written relative to the data section, then bumped by len(index_bytes)) and the MAX_UINT_64 empty-chunk sentinel that must not be touched. - _encode_partial_sync byte-range path: explain WHY morton-rank determines byte offset deterministically (fixed-size inner chunks = every slot at a stable offset regardless of which others are present); this is the load-bearing invariant for the byte-range fast path. - _decode_partial_sync: docstring now lists the two sub-paths (full-shard fetch vs. index-then-byte-ranges) and the reason the full-shard branch exists (one round trip beats N+1 small ones). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/zarr/codecs/sharding.py | 60 +++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index a64ce2bdab..1461cec3f6 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -486,7 +486,17 @@ def _decode_sync( shard_bytes: Buffer, shard_spec: ArraySpec, ) -> NDBuffer: - """Decode a full shard synchronously.""" + """Decode a full shard synchronously. + + Sync counterpart to ``_decode_single``. Same semantics (decode every + inner chunk and assemble the full shard array) but routes through + ``ChunkTransform`` instead of the async codec pipeline, so it can + run on the sync codec-pipeline fast path without an event loop. + + For a partial read where the caller only needs a slice of the shard, + use ``_decode_partial_sync`` instead — it fetches only the byte + ranges that overlap the selection. + """ shard_shape = shard_spec.shape chunk_shape = self.chunk_shape chunks_per_shard = self._get_chunks_per_shard(shard_spec) @@ -527,7 +537,22 @@ def _encode_sync( shard_array: NDBuffer, shard_spec: ArraySpec, ) -> Buffer | None: - """Encode a full shard synchronously.""" + """Encode a full shard synchronously. + + Sync counterpart to ``_encode_single``. Iterates inner chunks in + Morton (Z-curve) order — that's the canonical layout the shard + index expects — and encodes each through the inner ``ChunkTransform``. + Empty inner chunks become ``None`` entries when ``write_empty_chunks`` + is False, signalling ``_encode_shard_dict_sync`` to elide them + from the data section and mark them empty in the shard index. + + Returns ``None`` if every inner chunk was elided (an all-empty + shard) — callers treat that as "delete the shard key". + + For a partial write that only touches some inner chunks, use + ``_encode_partial_sync`` instead — it patches affected slots in + place when possible. + """ shard_shape = shard_spec.shape chunks_per_shard = self._get_chunks_per_shard(shard_spec) chunk_spec = self._get_chunk_spec(shard_spec) @@ -632,6 +657,12 @@ def _encode_partial_sync( # below can mutate it. 
index = _ShardIndex(shard_reader.index.offsets_and_lengths.copy()) + # Inner chunks are written in Morton (Z-curve) order, and + # because they're fixed-size we can compute a chunk's byte + # offset deterministically from its rank without consulting + # the shard index. This is what makes byte-range patching + # safe: the slot for a given chunk is always at the same + # offset regardless of which other chunks are present. rank_map = {c: r for r, c in enumerate(morton_order_iter(chunks_per_shard))} def _byte_offset(coords: tuple[int, ...]) -> int: @@ -733,12 +764,23 @@ def _encode_shard_dict_sync( chunks_per_shard: tuple[int, ...], buffer_prototype: BufferPrototype, ) -> Buffer | None: - """Sync version of _encode_shard_dict.""" + """Sync version of _encode_shard_dict. + + Pack the encoded inner chunks (in Morton order) into a contiguous + data section, build a shard index that points each present chunk + at its offset/length within that section, and concatenate. + + Returns ``None`` for an all-empty shard (no chunks present). + """ index = _ShardIndex.create_empty(chunks_per_shard) buffers = [] template = buffer_prototype.buffer.create_zero_length() chunk_start = 0 + # First pass: lay out present chunks in the data section. Offsets + # here are relative to the start of the data section, not the start + # of the final blob — they get shifted in the index-at-start branch + # below to account for the index-bytes prefix. for chunk_coords in morton_order_iter(chunks_per_shard): value = shard_dict.get(chunk_coords) if value is None or len(value) == 0: @@ -753,6 +795,11 @@ def _encode_shard_dict_sync( index_bytes = self._encode_shard_index_sync(index) if self.index_location == ShardingCodecIndexLocation.start: + # When the index is at the start of the blob, the data offsets + # we just wrote are off by len(index_bytes). Shift only the + # *present* chunks (empty chunks have offset == MAX_UINT_64 as + # a sentinel and must not be touched), then re-encode the index + # with the corrected offsets. empty_chunks_mask = index.offsets_and_lengths[..., 0] == MAX_UINT_64 index.offsets_and_lengths[~empty_chunks_mask, 0] += len(index_bytes) index_bytes = self._encode_shard_index_sync(index) @@ -893,6 +940,13 @@ def _decode_partial_sync( Reads only the inner-chunk byte ranges that overlap ``selection`` (plus the shard index) and decodes them through the inner codec chain. The store must support ``get_sync`` with byte ranges. + + Two sub-paths: + - If ``selection`` covers the entire shard, just fetch the whole + blob — that's strictly cheaper than two round trips (index, then + data) plus the per-chunk overhead of partial fetches. + - Otherwise fetch the index alone, look up only the byte slices of + the inner chunks the selection touches, fetch those, and decode. """ shard_shape = shard_spec.shape chunk_shape = self.chunk_shape From 2f97362ec91209290796fc20f5eba642a8e816b5 Mon Sep 17 00:00:00 2001 From: Davis Vann Bennett Date: Fri, 1 May 2026 10:01:48 -0400 Subject: [PATCH 22/22] docs: convert RST inline literals to Markdown-style backticks Docstrings added on this branch used RST-style ``literal`` markup (double-backticks). Convert to Markdown-style `literal` (single backticks) so the docstrings render correctly in Markdown-aware viewers without needing a separate RST-to-Markdown step. Two cases worth calling out: - src/zarr/core/codec_pipeline.py and src/zarr/codecs/sharding.py: every ``literal`` in these files came in on this branch, so the conversion is global within those files. 
- src/zarr/abc/store.py and src/zarr/core/array.py: only docstrings added on this branch are converted; pre-existing RST-style literals from main are left alone (out of scope). Also converted one .. note:: directive in src/zarr/core/array.py (the regular_chunk_array_spec helper) to a Markdown blockquote, since that directive was added on this branch. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/zarr/abc/store.py | 6 +-- src/zarr/codecs/sharding.py | 34 ++++++------- src/zarr/core/array.py | 15 +++--- src/zarr/core/codec_pipeline.py | 90 ++++++++++++++++----------------- 4 files changed, 73 insertions(+), 72 deletions(-) diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index c33651f016..acd00c2fb8 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -714,9 +714,9 @@ async def set_if_not_exists(self, default: Buffer) -> None: ... class SupportsSetRange(Protocol): """Protocol for stores that support writing to a byte range within an existing value. - Overwrites ``len(value)`` bytes starting at byte offset ``start`` within the - existing stored value for ``key``. The key must already exist and the write - must fit within the existing value (i.e., ``start + len(value) <= len(existing)``). + Overwrites `len(value)` bytes starting at byte offset `start` within the + existing stored value for `key`. The key must already exist and the write + must fit within the existing value (i.e., `start + len(value) <= len(existing)`). Behavior when the write extends past the end of the existing value is implementation-specific and should not be relied upon. diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py index 1461cec3f6..8cbb09f660 100644 --- a/src/zarr/codecs/sharding.py +++ b/src/zarr/codecs/sharding.py @@ -488,13 +488,13 @@ def _decode_sync( ) -> NDBuffer: """Decode a full shard synchronously. - Sync counterpart to ``_decode_single``. Same semantics (decode every + Sync counterpart to `_decode_single`. Same semantics (decode every inner chunk and assemble the full shard array) but routes through - ``ChunkTransform`` instead of the async codec pipeline, so it can + `ChunkTransform` instead of the async codec pipeline, so it can run on the sync codec-pipeline fast path without an event loop. For a partial read where the caller only needs a slice of the shard, - use ``_decode_partial_sync`` instead — it fetches only the byte + use `_decode_partial_sync` instead — it fetches only the byte ranges that overlap the selection. """ shard_shape = shard_spec.shape @@ -539,18 +539,18 @@ def _encode_sync( ) -> Buffer | None: """Encode a full shard synchronously. - Sync counterpart to ``_encode_single``. Iterates inner chunks in + Sync counterpart to `_encode_single`. Iterates inner chunks in Morton (Z-curve) order — that's the canonical layout the shard - index expects — and encodes each through the inner ``ChunkTransform``. - Empty inner chunks become ``None`` entries when ``write_empty_chunks`` - is False, signalling ``_encode_shard_dict_sync`` to elide them + index expects — and encodes each through the inner `ChunkTransform`. + Empty inner chunks become `None` entries when `write_empty_chunks` + is False, signalling `_encode_shard_dict_sync` to elide them from the data section and mark them empty in the shard index. - Returns ``None`` if every inner chunk was elided (an all-empty + Returns `None` if every inner chunk was elided (an all-empty shard) — callers treat that as "delete the shard key". 
For a partial write that only touches some inner chunks, use - ``_encode_partial_sync`` instead — it patches affected slots in + `_encode_partial_sync` instead — it patches affected slots in place when possible. """ shard_shape = shard_spec.shape @@ -594,15 +594,15 @@ def _encode_partial_sync( selection: SelectorTuple, shard_spec: ArraySpec, ) -> None: - """Sync equivalent of ``_encode_partial_single``. + """Sync equivalent of `_encode_partial_single`. Receives the source data for the written region (not a pre-merged shard array) and the selection within the shard, matching the calling convention of the async partial-encode path used by - ``BatchedCodecPipeline``. + `BatchedCodecPipeline`. When inner codecs are fixed-size and the store supports - ``set_range_sync``, partial writes update only the affected inner + `set_range_sync`, partial writes update only the affected inner chunks at their deterministic byte offsets. Otherwise falls back to a full shard rewrite. """ @@ -770,7 +770,7 @@ def _encode_shard_dict_sync( data section, build a shard index that points each present chunk at its offset/length within that section, and concatenate. - Returns ``None`` for an all-empty shard (no chunks present). + Returns `None` for an all-empty shard (no chunks present). """ index = _ShardIndex.create_empty(chunks_per_shard) buffers = [] @@ -935,14 +935,14 @@ def _decode_partial_sync( selection: SelectorTuple, shard_spec: ArraySpec, ) -> NDBuffer | None: - """Sync equivalent of ``_decode_partial_single``. + """Sync equivalent of `_decode_partial_single`. - Reads only the inner-chunk byte ranges that overlap ``selection`` + Reads only the inner-chunk byte ranges that overlap `selection` (plus the shard index) and decodes them through the inner codec - chain. The store must support ``get_sync`` with byte ranges. + chain. The store must support `get_sync` with byte ranges. Two sub-paths: - - If ``selection`` covers the entire shard, just fetch the whole + - If `selection` covers the entire shard, just fetch the whole blob — that's strictly cheaper than two round trips (index, then data) plus the per-chunk overhead of partial fetches. - Otherwise fetch the index alone, look up only the byte slices of diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 738bed0178..c05d1187c0 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -5407,13 +5407,14 @@ def _get_default_chunk_spec( build the ArraySpec once and reuse it for every chunk — avoiding the per-chunk ChunkGrid.__getitem__ + ArraySpec construction overhead. - .. note:: - Ideally the per-chunk ArraySpec would not exist at all: dtype, - fill_value, config, and prototype are constant across chunks — - only the shape varies (and only for edge chunks). A cleaner - design would pass a single ArraySpec plus a per-chunk shape - override, which ChunkTransform.decode_chunk already supports - via its ``chunk_shape`` parameter. + > **Note** + > + > Ideally the per-chunk ArraySpec would not exist at all: dtype, + > fill_value, config, and prototype are constant across chunks — + > only the shape varies (and only for edge chunks). A cleaner + > design would pass a single ArraySpec plus a per-chunk shape + > override, which ChunkTransform.decode_chunk already supports + > via its `chunk_shape` parameter. 
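+            >
+            > A sketch of that hypothetical calling convention (names here
+            > are illustrative, not a committed signature):
+            >
+            >     transform.decode_chunk(chunk_bytes, chunk_shape=edge_shape)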
""" if chunk_grid.is_regular: return ArraySpec( diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index 49f3320c15..60885e7f73 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -41,19 +41,19 @@ def _resolve_max_workers() -> int: - """Resolve ``codec_pipeline.max_workers`` config to an effective worker count. + """Resolve `codec_pipeline.max_workers` config to an effective worker count. - ``None`` means "auto" → ``os.cpu_count()`` (or 1 if unavailable). + `None` means "auto" → `os.cpu_count()` (or 1 if unavailable). Values < 1 are clamped to 1 (sequential). Notes ----- - The default (``None`` → ``cpu_count``) is tuned for large chunks + The default (`None` → `cpu_count`) is tuned for large chunks (≳ 1 MB encoded) where per-chunk decode + scatter is real work and threading helps. For small chunks (≲ 64 KB) the per-task pool overhead (≈ 30-50 µs submit + worker handoff) outweighs the work and threading slows things down by 1.5-3x. If your workload uses - many small chunks, set ``codec_pipeline.max_workers=1`` explicitly: + many small chunks, set `codec_pipeline.max_workers=1` explicitly: zarr.config.set({"codec_pipeline.max_workers": 1}) @@ -70,7 +70,7 @@ def _resolve_max_workers() -> int: def _get_pool(max_workers: int) -> ThreadPoolExecutor: - """Get or create the module-level thread pool, sized to ``max_workers``. + """Get or create the module-level thread pool, sized to `max_workers`. The pool grows on demand — if a request arrives for more workers than the current pool has, the existing pool is shut down and replaced. @@ -78,7 +78,7 @@ def _get_pool(max_workers: int) -> ThreadPoolExecutor: workers idle). Callers that want sequential execution should not call this — they - should run the task list inline. ``max_workers`` must be >= 1. + should run the task list inline. `max_workers` must be >= 1. """ global _pool, _pool_size if max_workers < 1: @@ -136,13 +136,13 @@ def _merge_chunk_array( is_complete_chunk: bool, drop_axes: tuple[int, ...], ) -> NDBuffer: - """Merge ``value`` into a full-chunk-shaped NDBuffer at ``chunk_selection``. + """Merge `value` into a full-chunk-shaped NDBuffer at `chunk_selection`. - If ``is_complete_chunk`` and ``value`` already covers the full chunk - shape, ``value`` is returned directly (no copy). Otherwise, a writable - buffer is materialized — either from ``existing_chunk_array.copy()`` if + If `is_complete_chunk` and `value` already covers the full chunk + shape, `value` is returned directly (no copy). Otherwise, a writable + buffer is materialized — either from `existing_chunk_array.copy()` if one was read from the store, or freshly allocated and filled with the - chunk's fill value — and the relevant slice of ``value`` is written into it. + chunk's fill value — and the relevant slice of `value` is written into it. """ if ( is_complete_chunk @@ -183,13 +183,13 @@ async def _async_read_fallback( ) -> tuple[GetResult, ...]: """Async fallback read used when no fast-path is available. - Fetches every chunk's bytes via ``concurrent_map`` (sized by - ``async.concurrency``), decodes the batch through ``pipeline.decode``, - then scatters each decoded chunk into ``out`` at its ``out_selection``. + Fetches every chunk's bytes via `concurrent_map` (sized by + `async.concurrency`), decodes the batch through `pipeline.decode`, + then scatters each decoded chunk into `out` at its `out_selection`. 
- Used by both ``BatchedCodecPipeline.read_batch`` (non-partial-decode - branch) and ``FusedCodecPipeline.read`` (when the store is not a - ``SupportsGetSync`` / sync transform is unavailable). + Used by both `BatchedCodecPipeline.read_batch` (non-partial-decode + branch) and `FusedCodecPipeline.read` (when the store is not a + `SupportsGetSync` / sync transform is unavailable). """ chunk_bytes_batch = await concurrent_map( [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch], @@ -226,17 +226,17 @@ async def _async_write_fallback( ) -> None: """Async fallback write used when no fast-path is available. - For each chunk in ``batch``: read its existing bytes from the store + For each chunk in `batch`: read its existing bytes from the store (skipping the read for complete chunks), decode the batch via - ``pipeline.decode``, merge ``value`` into each decoded chunk via - ``_merge_chunk_array``, drop chunks that are all-fill when - ``write_empty_chunks`` is False, encode the surviving chunks via - ``pipeline.encode``, then ``set`` the encoded bytes (or ``delete`` - if encoding produced ``None`` or the chunk dropped). - - Used by both ``BatchedCodecPipeline.write_batch`` (non-partial-encode - branch) and ``FusedCodecPipeline.write`` (when the store is not a - ``SupportsSetSync`` / sync transform is unavailable). + `pipeline.decode`, merge `value` into each decoded chunk via + `_merge_chunk_array`, drop chunks that are all-fill when + `write_empty_chunks` is False, encode the surviving chunks via + `pipeline.encode`, then `set` the encoded bytes (or `delete` + if encoding produced `None` or the chunk dropped). + + Used by both `BatchedCodecPipeline.write_batch` (non-partial-encode + branch) and `FusedCodecPipeline.write` (when the store is not a + `SupportsSetSync` / sync transform is unavailable). """ async def _read_key( @@ -364,14 +364,14 @@ def __post_init__(self) -> None: _cached_ab_spec: ArraySpec | None = field(init=False, repr=False, compare=False, default=None) def _resolve_specs(self, chunk_spec: ArraySpec) -> tuple[tuple[ArraySpec, ...], ArraySpec]: - """Return per-AA-codec input specs and the AB spec for ``chunk_spec``. + """Return per-AA-codec input specs and the AB spec for `chunk_spec`. - The codec chain only changes ``shape`` (via TransposeCodec etc.) — - ``prototype``, ``dtype``, ``fill_value``, and ``config`` are + The codec chain only changes `shape` (via TransposeCodec etc.) — + `prototype`, `dtype`, `fill_value`, and `config` are invariant. We cache the resolved spec chain keyed on - ``(chunk_spec.shape, id(chunk_spec))``, and reuse it directly - when the same ``chunk_spec`` is passed again. For a different - ``chunk_spec`` with the same shape, we recompute (cheap). + `(chunk_spec.shape, id(chunk_spec))`, and reuse it directly + when the same `chunk_spec` is passed again. For a different + `chunk_spec` with the same shape, we recompute (cheap). """ if not self._aa_codecs: return (), chunk_spec @@ -818,15 +818,15 @@ class FusedCodecPipeline(CodecPipeline): """Codec pipeline that uses the codec chain directly. Separates IO from compute without an intermediate layout abstraction. - The ShardingCodec handles shard IO internally via its ``_decode_sync`` - and ``_encode_sync`` methods, so the pipeline simply: + The ShardingCodec handles shard IO internally via its `_decode_sync` + and `_encode_sync` methods, so the pipeline simply: 1. Fetches the raw blob from the store (one key per chunk/shard). 2. Decodes/encodes through the codec chain (pure compute). 3. 
Writes the result back. - A ``ChunkTransform`` wraps the codec chain for fast synchronous - decode/encode when all codecs support ``SupportsSyncCodec``. + A `ChunkTransform` wraps the codec chain for fast synchronous + decode/encode when all codecs support `SupportsSyncCodec`. """ codecs: tuple[Codec, ...] @@ -951,17 +951,17 @@ def read_sync( ) -> tuple[GetResult, ...]: """Synchronous read: fetch -> decode -> scatter, per chunk. - When ``max_workers > 1`` and there are multiple chunks, each + When `max_workers > 1` and there are multiple chunks, each chunk's full lifecycle (fetch + decode + scatter) runs as one - task on a thread pool sized to ``max_workers`` — overlapping IO + task on a thread pool sized to `max_workers` — overlapping IO of one chunk with decode/scatter of another. Scatter is thread-safe because the chunks have non-overlapping output selections. - ``max_workers=1`` runs everything sequentially in the calling + `max_workers=1` runs everything sequentially in the calling thread (no pool involvement). - Mirrors ``BatchedCodecPipeline.read_batch``: when the AB codec + Mirrors `BatchedCodecPipeline.read_batch`: when the AB codec supports partial decoding (e.g. sharding), the codec handles its own IO and only fetches the inner-chunk byte ranges that overlap the read selection. Otherwise the pipeline fetches the full @@ -1032,18 +1032,18 @@ def write_sync( ) -> None: """Synchronous write: fetch existing -> merge+encode -> store. - When ``max_workers > 1`` and there are multiple chunks, each + When `max_workers > 1` and there are multiple chunks, each chunk's full lifecycle (get-existing + merge + encode + set/delete) - runs as one task on a thread pool sized to ``max_workers`` — + runs as one task on a thread pool sized to `max_workers` — overlapping IO of one chunk with compute of another. - ``max_workers=1`` runs everything sequentially in the calling + `max_workers=1` runs everything sequentially in the calling thread (no pool involvement). When the codec pipeline supports partial encoding (e.g. a sharding codec with no outer AA/BB codecs), the AB codec handles the full write cycle — reading existing data, merging, encoding, - and writing — matching the async ``BatchedCodecPipeline`` path. + and writing — matching the async `BatchedCodecPipeline` path. """ assert self._sync_transform is not None transform = self._sync_transform