diff --git a/.gitignore b/.gitignore
index 3284865d6c..59b6632a3c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -92,5 +92,9 @@ tests/.hypothesis
 zarr/version.py
 zarr.egg-info/
 
+# Local agent / planning notes (not versioned)
+.claude/
+CLAUDE.md
+docs/superpowers/
 # zarr-metadata package lockfile (a library, not an app)
 packages/zarr-metadata/uv.lock
diff --git a/changes/3885.feature.1.md b/changes/3885.feature.1.md
new file mode 100644
index 0000000000..0a51ba6844
--- /dev/null
+++ b/changes/3885.feature.1.md
@@ -0,0 +1 @@
+Added `SyncByteGetter` and `SyncByteSetter` runtime-checkable protocols and a `get_ranges_sync` method on the `Store` ABC. These let custom byte getters/setters opt into the synchronous codec pipeline's fast path for in-memory IO, which the sharding codec uses for its inner chunks.
diff --git a/changes/3885.feature.md b/changes/3885.feature.md
new file mode 100644
index 0000000000..9f4cd5cfbc
--- /dev/null
+++ b/changes/3885.feature.md
@@ -0,0 +1 @@
+`FusedCodecPipeline` is now the default codec pipeline. It runs codec compute synchronously and in bulk (avoiding the per-chunk async scheduling overhead of `BatchedCodecPipeline`), giving large speedups for sharded arrays (up to ~24x writes / ~14x reads on many-chunks-per-shard layouts, more with compression) and no regressions on compute-bound workloads. The previous behavior is available by setting `zarr.config.set({"codec_pipeline.path": "zarr.core.codec_pipeline.BatchedCodecPipeline"})`.
diff --git a/pyproject.toml b/pyproject.toml
index 9f6005f981..c7f99a454e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -445,6 +445,18 @@ filterwarnings = [
     # collected without being entered. This is a known issue in s3fs/aiobotocore, and pytest
     # per-test filterwarnings markers can't catch it (https://github.com/pytest-dev/pytest/issues/14096).
     "ignore:Exception ignored ((on calling weakref callback)|(in[\\s\\S]*Session was never entered)):pytest.PytestUnraisableExceptionWarning",
+    # pytest-asyncio implicitly creates an event loop in _get_event_loop_no_warn during
+    # fixture setup/teardown and never closes it (allocation site verified with
+    # PYTHONTRACEMALLOC: pytest_asyncio/plugin.py). When the garbage collector reclaims
+    # that loop (and its self-pipe socketpair: AF_UNIX family=1 on POSIX, emulated with
+    # AF_INET family=2 on Windows) mid-test, the unraisable hook fails whichever unrelated
+    # test happens to be running — the long-standing "random cross-file failure" in the
+    # pipeline suites. The message contains only the __del__ repr, so these patterns cannot
+    # scope to pytest-asyncio specifically: a loop/socketpair leak in zarr's own sync
+    # machinery would also be silenced. Accepted tradeoff — revisit if zarr.core.sync grows
+    # loop-lifecycle changes.
+    "ignore:Exception ignored in[\\s\\S]*<function BaseEventLoop.__del__:pytest.PytestUnraisableExceptionWarning",
+    "ignore:Exception ignored in[\\s\\S]*<socket.socket fd=\\d+, family=(1|2), type=1:pytest.PytestUnraisableExceptionWarning",
 ]
 markers = [
     "asyncio: mark test as asyncio test",
diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py
index 304d0cddb5..ebbcae47b1 100644
--- a/src/zarr/abc/store.py
+++ b/src/zarr/abc/store.py
@@ -22,6 +22,8 @@
     "SupportsGetSync",
     "SupportsSetSync",
     "SupportsSyncStore",
+    "SyncByteGetter",
+    "SyncByteSetter",
     "set_or_delete",
 ]
 
@@ -469,6 +471,73 @@ async def get_ranges(
         ):
             yield group
 
+    def get_ranges_sync(
+        self,
+        key: str,
+        byte_ranges: Sequence[ByteRequest | None],
+        *,
+        prototype: BufferPrototype,
+        max_gap_bytes: int = 1 << 20,  # 1 MiB
+        max_coalesced_bytes: int = 16 << 20,  # 16 MiB
+    ) -> Sequence[tuple[int, Buffer | None]]:
+        """Synchronous, coalescing counterpart of `get_ranges`.
+
+        Plans merged fetches with the same `coalesce_ranges` policy as the async
+        path, then issues one synchronous `get_sync` per merged group (or per
+        uncoalescable request) and slices results back into per-input buffers.
+        Used by the sync codec pipeline's partial-shard reads so they get the
+        same byte-range coalescing as the async path, without an event loop.
+
+        Returns a list of `(input_index, Buffer | None)`. Raises
+        `BaseExceptionGroup` containing a `FileNotFoundError` if the key is
+        absent (matching `get_ranges`), so callers can handle a deleted shard
+        uniformly across the sync and async paths.
+
+        Requires the store to implement `get_sync` (`SupportsGetSync`).
+        """
+        from zarr.core._coalesce import coalesce_ranges
+
+        if not isinstance(self, SupportsGetSync):
+            raise TypeError(f"{type(self).__name__} does not support synchronous reads")
+
+        groups, uncoalescable = coalesce_ranges(
+            byte_ranges, max_gap_bytes=max_gap_bytes, max_coalesced_bytes=max_coalesced_bytes
+        )
+        results: list[tuple[int, Buffer | None]] = []
+        errors: list[BaseException] = []
+
+        def _get(req: ByteRequest | None) -> Buffer | None:
+            return self.get_sync(key, prototype=prototype, byte_range=req)
+
+        for idx, req in uncoalescable:
+            buf = _get(req)
+            if buf is None:
+                errors.append(FileNotFoundError(key))
+            else:
+                results.append((idx, buf))
+
+        for members in groups:
+            if len(members) == 1:
+                solo_idx, solo_req = members[0]
+                buf = _get(solo_req)
+                if buf is None:
+                    errors.append(FileNotFoundError(key))
+                else:
+                    results.append((solo_idx, buf))
+                continue
+            start = members[0][1].start
+            end = max(r.end for _, r in members)
+            big = _get(RangeByteRequest(start, end))
+            if big is None:
+                errors.append(FileNotFoundError(key))
+                continue
+            for member_idx, r in members:
+                results.append((member_idx, big[r.start - start : r.end - start]))
+
+        if errors:
+            raise BaseExceptionGroup("chunk read failed", errors)
+        return results
+
     async def getsize(self, key: str) -> int:
         """
         Return the size, in bytes, of a value in a Store.
@@ -562,6 +631,33 @@ async def delete(self) -> None: ...
     async def set_if_not_exists(self, default: Buffer) -> None: ...
 
 
+@runtime_checkable
+class SyncByteGetter(Protocol):
+    """A `ByteGetter` that can also fetch synchronously, without an event loop.
+
+    Non-StorePath byte getters (e.g. the sharding codec's in-memory
+    `_ShardingByteGetter`) implement this so a synchronous codec pipeline can
+    take its sync fast path on them instead of scheduling one coroutine per
+    chunk. Note that `StorePath` also *has* a `get_sync` method (so it matches
+    this protocol structurally) but it only works when its store supports
+    synchronous IO — callers gate `StorePath` on the store's `SupportsGetSync`
+    instead of on this protocol.
+    """
+
+    def get_sync(
+        self, prototype: BufferPrototype | None = None, byte_range: ByteRequest | None = None
+    ) -> Buffer | None: ...
+
+
+@runtime_checkable
+class SyncByteSetter(SyncByteGetter, Protocol):
+    """A `ByteSetter` that can also write synchronously. See `SyncByteGetter`."""
+
+    def set_sync(self, value: Buffer) -> None: ...
+
+    def delete_sync(self) -> None: ...
+
+
 @runtime_checkable
 class SupportsGetSync(Protocol):
     def get_sync(
diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py
index 3c6c99c21c..7fdf408d1d 100644
--- a/src/zarr/codecs/_v2.py
+++ b/src/zarr/codecs/_v2.py
@@ -23,7 +23,7 @@ class V2Codec(ArrayBytesCodec):
 
     is_fixed_size = False
 
-    async def _decode_single(
+    def _decode_sync(
         self,
         chunk_bytes: Buffer,
         chunk_spec: ArraySpec,
@@ -31,14 +31,14 @@ async def _decode_single(
         cdata = chunk_bytes.as_array_like()
         # decompress
         if self.compressor:
-            chunk = await asyncio.to_thread(self.compressor.decode, cdata)
+            chunk = self.compressor.decode(cdata)
         else:
             chunk = cdata
 
         # apply filters
         if self.filters:
             for f in reversed(self.filters):
-                chunk = await asyncio.to_thread(f.decode, chunk)
+                chunk = f.decode(chunk)
 
         # view as numpy array with correct dtype
         chunk = ensure_ndarray_like(chunk)
@@ -70,7 +70,7 @@ async def _decode_single(
 
         return get_ndbuffer_class().from_ndarray_like(chunk)
 
-    async def _encode_single(
+    def _encode_sync(
         self,
         chunk_array: NDBuffer,
         chunk_spec: ArraySpec,
@@ -83,18 +83,32 @@ async def _encode_single(
         # apply filters
         if self.filters:
             for f in self.filters:
-                chunk = await asyncio.to_thread(f.encode, chunk)
+                chunk = f.encode(chunk)
         # check object encoding
         if ensure_ndarray_like(chunk).dtype == object:
             raise RuntimeError("cannot write object array without object codec")
 
         # compress
         if self.compressor:
-            cdata = await asyncio.to_thread(self.compressor.encode, chunk)
+            cdata = self.compressor.encode(chunk)
         else:
             cdata = chunk
         cdata = ensure_bytes(cdata)
         return chunk_spec.prototype.buffer.from_bytes(cdata)
 
+    async def _decode_single(
+        self,
+        chunk_bytes: Buffer,
+        chunk_spec: ArraySpec,
+    ) -> NDBuffer:
+        return await asyncio.to_thread(self._decode_sync, chunk_bytes, chunk_spec)
+
+    async def _encode_single(
+        self,
+        chunk_array: NDBuffer,
+        chunk_spec: ArraySpec,
+    ) -> Buffer | None:
+        return await asyncio.to_thread(self._encode_sync, chunk_array, chunk_spec)
+
     def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int:
         raise NotImplementedError
diff --git a/src/zarr/codecs/numcodecs/_codecs.py b/src/zarr/codecs/numcodecs/_codecs.py
index 1be1381a08..8b84ce538a 100644
--- a/src/zarr/codecs/numcodecs/_codecs.py
+++ b/src/zarr/codecs/numcodecs/_codecs.py
@@ -47,7 +47,7 @@
 if TYPE_CHECKING:
     from zarr.abc.numcodec import Numcodec
     from zarr.core.array_spec import ArraySpec
-    from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer
+    from zarr.core.buffer import Buffer, NDBuffer
 
 CODEC_PREFIX = "numcodecs."
 
@@ -134,53 +134,63 @@ class _NumcodecsBytesBytesCodec(_NumcodecsCodec, BytesBytesCodec):
     def __init__(self, **codec_config: JSON) -> None:
         super().__init__(**codec_config)
 
-    async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
-        return await asyncio.to_thread(
-            as_numpy_array_wrapper,
-            self._codec.decode,
-            chunk_data,
-            chunk_spec.prototype,
-        )
+    def _decode_sync(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
+        return as_numpy_array_wrapper(self._codec.decode, chunk_data, chunk_spec.prototype)
 
-    def _encode(self, chunk_data: Buffer, prototype: BufferPrototype) -> Buffer:
+    def _encode_sync(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
         encoded = self._codec.encode(chunk_data.as_array_like())
         if isinstance(encoded, np.ndarray):  # Required for checksum codecs
-            return prototype.buffer.from_bytes(encoded.tobytes())
-        return prototype.buffer.from_bytes(encoded)
+            return chunk_spec.prototype.buffer.from_bytes(encoded.tobytes())
+        return chunk_spec.prototype.buffer.from_bytes(encoded)
+
+    async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
+        return await asyncio.to_thread(self._decode_sync, chunk_data, chunk_spec)
 
     async def _encode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> Buffer:
-        return await asyncio.to_thread(self._encode, chunk_data, chunk_spec.prototype)
+        return await asyncio.to_thread(self._encode_sync, chunk_data, chunk_spec)
 
 
 class _NumcodecsArrayArrayCodec(_NumcodecsCodec, ArrayArrayCodec):
     def __init__(self, **codec_config: JSON) -> None:
         super().__init__(**codec_config)
 
-    async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
+    def _decode_sync(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
         chunk_ndarray = chunk_data.as_ndarray_like()
-        out = await asyncio.to_thread(self._codec.decode, chunk_ndarray)
+        out = self._codec.decode(chunk_ndarray)
         return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape))
 
-    async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
+    def _encode_sync(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
         chunk_ndarray = chunk_data.as_ndarray_like()
-        out = await asyncio.to_thread(self._codec.encode, chunk_ndarray)
+        out = self._codec.encode(chunk_ndarray)
         return chunk_spec.prototype.nd_buffer.from_ndarray_like(out)
 
+    async def _decode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
+        return await asyncio.to_thread(self._decode_sync, chunk_data, chunk_spec)
+
+    async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer:
+        return await asyncio.to_thread(self._encode_sync, chunk_data, chunk_spec)
+
 
 class _NumcodecsArrayBytesCodec(_NumcodecsCodec, ArrayBytesCodec):
     def __init__(self, **codec_config: JSON) -> None:
         super().__init__(**codec_config)
 
-    async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer:
+    def _decode_sync(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer:
         chunk_bytes = chunk_data.to_bytes()
-        out = await asyncio.to_thread(self._codec.decode, chunk_bytes)
+        out = self._codec.decode(chunk_bytes)
         return chunk_spec.prototype.nd_buffer.from_ndarray_like(out.reshape(chunk_spec.shape))
 
-    async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer:
+    def _encode_sync(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer:
         chunk_ndarray = chunk_data.as_ndarray_like()
-        out = await asyncio.to_thread(self._codec.encode, chunk_ndarray)
+        out = self._codec.encode(chunk_ndarray)
         return chunk_spec.prototype.buffer.from_bytes(out)
 
+    async def _decode_single(self, chunk_data: Buffer, chunk_spec: ArraySpec) -> NDBuffer:
+        return await asyncio.to_thread(self._decode_sync, chunk_data, chunk_spec)
+
+    async def _encode_single(self, chunk_data: NDBuffer, chunk_spec: ArraySpec) -> Buffer:
+        return await asyncio.to_thread(self._encode_sync, chunk_data, chunk_spec)
+
 
 # bytes-to-bytes codecs
 class Blosc(_NumcodecsBytesBytesCodec, codec_name="blosc"):
diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py
index 657484e9af..ab8282338e 100644
--- a/src/zarr/codecs/sharding.py
+++ b/src/zarr/codecs/sharding.py
@@ -4,7 +4,7 @@
 from dataclasses import dataclass, replace
 from enum import Enum
 from functools import lru_cache
-from typing import TYPE_CHECKING, Any, Literal, NamedTuple, cast
+from typing import TYPE_CHECKING, Any, Literal, NamedTuple
 
 import numpy as np
 import numpy.typing as npt
@@ -15,13 +15,16 @@
     ArrayBytesCodecPartialEncodeMixin,
     Codec,
     CodecPipeline,
+    SupportsSyncCodec,
 )
 from zarr.abc.store import (
     ByteGetter,
     ByteRequest,
     ByteSetter,
     RangeByteRequest,
+    Store,
     SuffixByteRequest,
+    SupportsGetSync,
 )
 from zarr.codecs.bytes import BytesCodec
 from zarr.codecs.crc32c_ import Crc32cCodec
@@ -41,6 +44,8 @@
     parse_shapelike,
     product,
 )
+from zarr.core.config import config as zarr_config
+from zarr.core.dtype.common import HasEndianness
 from zarr.core.dtype.npy.int import UInt64
 from zarr.core.indexing import (
     BasicIndexer,
@@ -96,13 +101,21 @@ def parse_index_location(data: object) -> ShardingCodecIndexLocation:
 
 @dataclass(frozen=True)
 class _ShardingByteGetter(ByteGetter):
+    """In-memory byte getter for one inner chunk of a shard.
+
+    Implements `SyncByteGetter` (dict access needs no event loop), so the
+    synchronous codec pipeline takes its sync fast path on inner chunks
+    instead of scheduling one coroutine per chunk; the async `get` simply
+    delegates to `get_sync`.
+    """
+
     shard_dict: ShardMapping
     chunk_coords: tuple[int, ...]
 
-    async def get(
-        self, prototype: BufferPrototype, byte_range: ByteRequest | None = None
+    def get_sync(
+        self, prototype: BufferPrototype | None = None, byte_range: ByteRequest | None = None
     ) -> Buffer | None:
-        assert prototype == default_buffer_prototype(), (
+        assert prototype is None or prototype == default_buffer_prototype(), (
             f"prototype is not supported within shards currently. diff: {prototype} != {default_buffer_prototype()}"
         )
         value = self.shard_dict.get(self.chunk_coords)
@@ -113,17 +126,33 @@ async def get(
         start, stop = _normalize_byte_range_index(value, byte_range)
         return value[start:stop]
 
+    async def get(
+        self, prototype: BufferPrototype, byte_range: ByteRequest | None = None
+    ) -> Buffer | None:
+        return self.get_sync(prototype, byte_range)
+
 
 @dataclass(frozen=True)
 class _ShardingByteSetter(_ShardingByteGetter, ByteSetter):
+    """In-memory byte setter for one inner chunk of a shard.
+
+    Implements `SyncByteSetter`; the async methods delegate to the sync ones.
+    """
+
     shard_dict: ShardMutableMapping
 
+    def set_sync(self, value: Buffer) -> None:
+        self.shard_dict[self.chunk_coords] = value
+
+    def delete_sync(self) -> None:
+        del self.shard_dict[self.chunk_coords]
+
     async def set(self, value: Buffer, byte_range: ByteRequest | None = None) -> None:
         assert byte_range is None, "byte_range is not supported within shards"
-        self.shard_dict[self.chunk_coords] = value
+        self.set_sync(value)
 
     async def delete(self) -> None:
-        del self.shard_dict[self.chunk_coords]
+        self.delete_sync()
 
     async def set_if_not_exists(self, default: Buffer) -> None:
         self.shard_dict.setdefault(self.chunk_coords, default)
@@ -147,6 +176,24 @@ def is_all_empty(self) -> bool:
     def get_full_chunk_map(self) -> npt.NDArray[np.bool_]:
         return np.not_equal(self.offsets_and_lengths[..., 0], MAX_UINT_64)
 
+    def is_dense(self, chunk_byte_length: int) -> bool:
+        """True when every chunk is present, fixed-length, and uniquely placed.
+
+        Used to gate the vectorized whole-shard decode: a dense fixed-size shard
+        is a regular grid of equal-length payloads, so it can be reshaped/scattered
+        in bulk rather than decoded chunk-by-chunk.
+        """
+        offsets = self.offsets_and_lengths[..., 0].reshape(-1)
+        lengths = self.offsets_and_lengths[..., 1].reshape(-1)
+        # all present
+        if bool(np.any(offsets == MAX_UINT_64)):
+            return False
+        # all the same fixed length
+        if not bool(np.all(lengths == chunk_byte_length)):
+            return False
+        # offsets unique (no two chunks share a slot)
+        return int(np.unique(offsets).size) == int(offsets.size)
+
     def get_chunk_slice(self, chunk_coords: tuple[int, ...]) -> tuple[int, int] | None:
         localized_chunk = self._localize_chunk(chunk_coords)
         chunk_start, chunk_len = self.offsets_and_lengths[localized_chunk]
@@ -301,6 +348,8 @@ class ShardingCodec(
     does not recover it (the setting reverts to the `morton` default per codec instance).
     """
 
+    is_fixed_size = False
+
     chunk_shape: tuple[int, ...]
     codecs: tuple[Codec, ...]
     index_codecs: tuple[Codec, ...]
@@ -339,6 +388,13 @@ def __init__(
         # object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec))
         object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec))
         object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard))
+        object.__setattr__(self, "_shard_index_size", lru_cache()(self._shard_index_size))
+        object.__setattr__(
+            self, "_get_inner_chunk_transform", lru_cache()(self._get_inner_chunk_transform)
+        )
+        object.__setattr__(
+            self, "_get_index_chunk_transform", lru_cache()(self._get_index_chunk_transform)
+        )
 
     # todo: typedict return type
     def __getstate__(self) -> dict[str, Any]:
@@ -358,6 +414,13 @@ def __setstate__(self, state: dict[str, Any]) -> None:
         # object.__setattr__(self, "_get_chunk_spec", lru_cache()(self._get_chunk_spec))
         object.__setattr__(self, "_get_index_chunk_spec", lru_cache()(self._get_index_chunk_spec))
         object.__setattr__(self, "_get_chunks_per_shard", lru_cache()(self._get_chunks_per_shard))
+        object.__setattr__(self, "_shard_index_size", lru_cache()(self._shard_index_size))
+        object.__setattr__(
+            self, "_get_inner_chunk_transform", lru_cache()(self._get_inner_chunk_transform)
+        )
+        object.__setattr__(
+            self, "_get_index_chunk_transform", lru_cache()(self._get_index_chunk_transform)
+        )
 
     @classmethod
     def from_dict(cls, data: dict[str, JSON]) -> Self:
@@ -366,8 +429,48 @@ def from_dict(cls, data: dict[str, JSON]) -> Self:
 
     @property
     def codec_pipeline(self) -> CodecPipeline:
+        # Resolve against the configured pipeline (registry default), matching the
+        # rest of this module's use of get_pipeline_class — NOT a hard-coded
+        # BatchedCodecPipeline. This restores main's behavior (#2179) that the
+        # branch had reverted: the inner sub-chunk pipeline follows the same
+        # codec_pipeline.path config as the outer array.
         return get_pipeline_class().from_codecs(self.codecs)
 
+    def _get_inner_pipeline(self, shard_spec: ArraySpec) -> CodecPipeline:
+        """The nested pipeline for inner-chunk IO, evolved against the inner
+        chunk spec.
+
+        Evolving matters for two reasons: it threads the spec through the inner
+        codec chain (spec-changing codecs see the spec they will actually
+        operate on), and — for a synchronous pipeline — it builds the sync
+        transform, so inner-chunk IO over the (sync-capable) sharding byte
+        getters takes the pipeline's sync fast path instead of scheduling one
+        coroutine per inner chunk. The bare `codec_pipeline` property returns an
+        unevolved pipeline, which a synchronous pipeline can only run through
+        its async fallback.
+
+        Memoized per (pipeline class, batch size, shard_spec): evolving builds
+        a ChunkTransform, which is wasteful to redo on every shard operation.
+        The pipeline class and batch size participate in the key so the
+        `codec_pipeline.path` and `codec_pipeline.batch_size` configs are still
+        honored after the first use (`from_codecs` captures batch_size at
+        construction). A benign construction race between threads is possible
+        (last writer wins) — same as the other caches here.
+        """
+        cache: dict[tuple[type[CodecPipeline], int, ArraySpec], CodecPipeline] | None = getattr(
+            self, "_inner_pipeline_cache", None
+        )
+        if cache is None:
+            cache = {}
+            object.__setattr__(self, "_inner_pipeline_cache", cache)
+        key = (get_pipeline_class(), zarr_config.get("codec_pipeline.batch_size"), shard_spec)
+        pipeline = cache.get(key)
+        if pipeline is None:
+            chunk_spec = self._get_chunk_spec(shard_spec)
+            pipeline = self.codec_pipeline.evolve_from_array_spec(chunk_spec)
+            cache[key] = pipeline
+        return pipeline
+
     def to_dict(self) -> dict[str, JSON]:
         return {
             "name": "sharding_indexed",
@@ -380,8 +483,16 @@ def to_dict(self) -> dict[str, JSON]:
         }
 
     def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
+        # Thread the spec through the inner chain (`evolve_codecs`): each codec
+        # is evolved against the spec produced by the previous one. Evolving
+        # every codec against the same unthreaded spec is the bug shape that
+        # strips `BytesCodec.endian` behind a dtype-changing codec — and this
+        # method runs on the real array-creation path, baking the damaged chain
+        # into the evolved instance before the transform builders ever run.
+        from zarr.core.codec_pipeline import evolve_codecs
+
         shard_spec = self._get_chunk_spec(array_spec)
-        evolved_codecs = tuple(c.evolve_from_array_spec(array_spec=shard_spec) for c in self.codecs)
+        evolved_codecs = evolve_codecs(self.codecs, shard_spec)
         if evolved_codecs != self.codecs:
             return replace(self, codecs=evolved_codecs)
         return self
@@ -416,6 +527,395 @@ def validate(
                         f"divisible by the shard's inner chunk size {inner}."
                     )
 
+    def _get_inner_chunk_transform(self, shard_spec: ArraySpec) -> Any:
+        """The synchronous transform for the inner codec chain.
+
+        Memoized by the instance-local `lru_cache` wrapping installed in
+        `__init__`/`__setstate__` (the single cache mechanism for these
+        builders — do not add another layer inside the body).
+
+        Codecs are evolved with the spec THREADED forward (`evolve_codecs`):
+        each inner codec is evolved against the spec produced by the previous
+        one, not the original chunk spec. Evolving every codec against the same
+        unthreaded spec is the bug shape that stripped `BytesCodec.endian` at
+        the pipeline level (see `evolve_codecs`) — the inner chain must use the
+        same single source of truth.
+        """
+        from zarr.core.codec_pipeline import ChunkTransform, evolve_codecs
+
+        chunk_spec = self._get_chunk_spec(shard_spec)
+        return ChunkTransform(codecs=evolve_codecs(self.codecs, chunk_spec))
+
+    def _get_index_chunk_transform(self, chunks_per_shard: tuple[int, ...]) -> Any:
+        """The synchronous transform for the index codec chain.
+
+        Memoized via instance-local `lru_cache` and spec-threaded for the same
+        reasons as `_get_inner_chunk_transform`.
+        """
+        from zarr.core.codec_pipeline import ChunkTransform, evolve_codecs
+
+        index_spec = self._get_index_chunk_spec(chunks_per_shard)
+        return ChunkTransform(codecs=evolve_codecs(self.index_codecs, index_spec))
+
+    def _decode_shard_index_sync(
+        self, index_bytes: Buffer, chunks_per_shard: tuple[int, ...]
+    ) -> _ShardIndex:
+        """Decode shard index synchronously using ChunkTransform."""
+        index_transform = self._get_index_chunk_transform(chunks_per_shard)
+        index_spec = self._get_index_chunk_spec(chunks_per_shard)
+        index_array = index_transform.decode_chunk(index_bytes, index_spec)
+        return _ShardIndex(chunks_per_shard, index_array.as_numpy_array())
+
+    def _encode_shard_index_sync(self, index: _ShardIndex) -> Buffer:
+        """Encode shard index synchronously using ChunkTransform."""
+        index_transform = self._get_index_chunk_transform(index.chunks_per_shard)
+        index_spec = self._get_index_chunk_spec(index.chunks_per_shard)
+        index_nd = get_ndbuffer_class().from_numpy_array(index.offsets_and_lengths)
+        result: Buffer | None = index_transform.encode_chunk(index_nd, index_spec)
+        assert result is not None
+        return result
+
+    def _shard_reader_from_bytes_sync(
+        self, buf: Buffer, chunks_per_shard: tuple[int, ...]
+    ) -> _ShardReader:
+        """Sync version of _ShardReader.from_bytes."""
+        shard_index_size = self._shard_index_size(chunks_per_shard)
+        if self.index_location == ShardingCodecIndexLocation.start:
+            shard_index_bytes = buf[:shard_index_size]
+        else:
+            shard_index_bytes = buf[-shard_index_size:]
+        index = self._decode_shard_index_sync(shard_index_bytes, chunks_per_shard)
+        reader = _ShardReader()
+        reader.buf = buf
+        reader.index = index
+        return reader
+
+    def _decode_sync(
+        self,
+        shard_bytes: Buffer,
+        shard_spec: ArraySpec,
+    ) -> NDBuffer:
+        """Decode a full shard synchronously.
+
+        Sync counterpart to `_decode_single`. Same semantics (decode every
+        inner chunk and assemble the full shard array) but routes through
+        `ChunkTransform` instead of the async codec pipeline, so it can
+        run on the sync codec-pipeline fast path without an event loop.
+
+        For a partial read where the caller only needs a slice of the shard,
+        use `_decode_partial_sync` instead — it fetches only the byte
+        ranges that overlap the selection.
+
+        This method does not parallelize decompression, but should.
+        See TODO: make issue for handling subchunk parallelism
+        """
+        shard_shape = shard_spec.shape
+        chunk_shape = self.chunk_shape
+        chunks_per_shard = self._get_chunks_per_shard(shard_spec)
+        chunk_spec = self._get_chunk_spec(shard_spec)
+        inner_transform = self._get_inner_chunk_transform(shard_spec)
+
+        indexer = BasicIndexer(
+            tuple(slice(0, s) for s in shard_shape),
+            shape=shard_shape,
+            chunk_grid=ChunkGrid.from_sizes(shard_shape, chunk_shape),
+        )
+
+        out = chunk_spec.prototype.nd_buffer.empty(
+            shape=shard_shape,
+            dtype=shard_spec.dtype.to_native_dtype(),
+            order=shard_spec.order,
+        )
+
+        shard_dict = self._shard_reader_from_bytes_sync(shard_bytes, chunks_per_shard)
+
+        if shard_dict.index.is_all_empty():
+            out.fill(shard_spec.fill_value)
+            return out
+
+        from zarr.core.codec_pipeline import decode_and_scatter_chunk
+
+        for chunk_coords, chunk_selection, out_selection, _ in indexer:
+            # the GetResult status is discarded: missing INNER chunks of a
+            # present shard always fill (read_missing_chunks is a store-key
+            # level promise, applied to top-level statuses at the array layer)
+            decode_and_scatter_chunk(
+                shard_dict.get(chunk_coords),
+                out,
+                chunk_spec=chunk_spec,
+                chunk_selection=chunk_selection,
+                out_selection=out_selection,
+                drop_axes=(),
+                decode=inner_transform.decode_chunk,
+            )
+
+        return out
+
+    def _encode_sync(
+        self,
+        shard_array: NDBuffer,
+        shard_spec: ArraySpec,
+    ) -> Buffer | None:
+        """Encode a full shard synchronously.
+
+        Sync counterpart to `_encode_single`. This is reached when a
+        `ShardingCodec` is an *inner* codec of another sharding codec (nested
+        sharding): the outer codec encodes each inner chunk through its
+        `ChunkTransform`, which calls this method on the inner `ShardingCodec`.
+
+        Each inner chunk is encoded through the inner `ChunkTransform` and
+        collected into an intermediate `dict`. The dict's key order is
+        immaterial — the physical on-disk layout is decided downstream by the
+        `subchunk_write_order` loop in `_encode_shard_dict_sync` (this method
+        does NOT impose a layout). Empty inner chunks become `None` entries when
+        `write_empty_chunks` is False, signalling `_encode_shard_dict_sync` to
+        elide them from the data section and mark them empty in the shard index.
+
+        Returns `None` if every inner chunk was elided (an all-empty shard) —
+        callers treat that as "delete the shard key".
+
+        This method does not parallelize compression, but should.
+        See TODO: make issue for handling subchunk parallelism
+
+        For a partial write that only touches some inner chunks, use
+        `_encode_partial_sync` instead.
+        """
+        shard_shape = shard_spec.shape
+        chunks_per_shard = self._get_chunks_per_shard(shard_spec)
+        chunk_spec = self._get_chunk_spec(shard_spec)
+        inner_transform = self._get_inner_chunk_transform(shard_spec)
+
+        indexer = BasicIndexer(
+            tuple(slice(0, s) for s in shard_shape),
+            shape=shard_shape,
+            chunk_grid=ChunkGrid.from_sizes(shard_shape, self.chunk_shape),
+        )
+
+        from zarr.core.codec_pipeline import encode_or_elide_chunk
+
+        # Key order here is immaterial; _encode_shard_dict_sync lays the present
+        # chunks out in subchunk_write_order.
+        shard_builder: dict[tuple[int, ...], Buffer | None] = dict.fromkeys(
+            morton_order_iter(chunks_per_shard)
+        )
+
+        for chunk_coords, _chunk_selection, out_selection, _ in indexer:
+            # None = chunk normalized to missing (see encode_or_elide_chunk)
+            shard_builder[chunk_coords] = encode_or_elide_chunk(
+                shard_array[out_selection], chunk_spec, inner_transform.encode_chunk
+            )
+
+        return self._encode_shard_dict_sync(
+            shard_builder,
+            chunks_per_shard=chunks_per_shard,
+            buffer_prototype=default_buffer_prototype(),
+        )
+
+    def _encode_partial_sync(
+        self,
+        byte_setter: Any,
+        value: NDBuffer,
+        selection: SelectorTuple,
+        shard_spec: ArraySpec,
+    ) -> None:
+        """Sync equivalent of `_encode_partial_single`.
+
+        Receives the source data for the written region (not a pre-merged
+        shard array) and the selection within the shard, matching the
+        calling convention of the async partial-encode path used by
+        `BatchedCodecPipeline`.
+
+        This method does not parallelize compression, but should.
+        See TODO: make issue for handling subchunk parallelism
+
+        Loads the existing shard, merges the written region into the affected
+        inner chunks, and rewrites the whole shard.
+        """
+        shard_shape = shard_spec.shape
+        chunks_per_shard = self._get_chunks_per_shard(shard_spec)
+        chunk_spec = self._get_chunk_spec(shard_spec)
+        inner_transform = self._get_inner_chunk_transform(shard_spec)
+
+        indexer = list(
+            get_indexer(
+                selection,
+                shape=shard_shape,
+                chunk_grid=ChunkGrid.from_sizes(shard_shape, self.chunk_shape),
+            )
+        )
+
+        is_complete = self._is_complete_shard_write(indexer, chunks_per_shard)
+
+        is_scalar = len(value.shape) == 0
+
+        # Load existing inner-chunk bytes into a dict (same structure as
+        # the async path's shard_dict).
+        if is_complete:
+            shard_dict: dict[tuple[int, ...], Buffer | None] = dict.fromkeys(
+                morton_order_iter(chunks_per_shard)
+            )
+        else:
+            existing_bytes = byte_setter.get_sync(prototype=shard_spec.prototype)
+            if existing_bytes is not None:
+                shard_reader_fb = self._shard_reader_from_bytes_sync(
+                    existing_bytes, chunks_per_shard
+                )
+                # Build the dict with one vectorized index lookup over all chunks,
+                # matching the async _encode_partial_single path. A per-coordinate
+                # __getitem__ loop here is O(n_chunks) Python overhead that dominates
+                # partial writes into shards with many inner chunks.
+                shard_dict = shard_reader_fb.to_dict_vectorized(
+                    np.indices(chunks_per_shard).reshape(len(chunks_per_shard), -1).T
+                )
+            else:
+                shard_dict = dict.fromkeys(morton_order_iter(chunks_per_shard))
+
+        from zarr.core.codec_pipeline import merge_and_encode_chunk
+
+        # Merge, encode, and store each affected inner chunk into shard_dict via
+        # the canonical merge_and_encode_chunk (None = normalized to missing).
+        #
+        # Scalar fast path: when the written value is a scalar broadcast, every
+        # *complete* inner chunk is byte-for-byte identical — same fill, same
+        # empty-check, same encoded bytes. Compute that outcome once and reuse it
+        # for all complete chunks instead of re-merging, re-checking, and
+        # re-encoding tens of thousands of identical chunks. Incomplete (edge)
+        # chunks still merge against their own existing data individually.
+        # `_sentinel` distinguishes "not computed yet" from a memoized `None`
+        # (an empty chunk).
+        _sentinel = object()
+        scalar_complete_result: Buffer | None | object = _sentinel
+
+        for chunk_coords, chunk_sel, out_sel, is_complete_chunk in indexer:
+            if is_scalar and is_complete_chunk:
+                if scalar_complete_result is _sentinel:
+                    scalar_complete_result = merge_and_encode_chunk(
+                        None,
+                        value,
+                        chunk_spec=chunk_spec,
+                        chunk_selection=chunk_sel,
+                        out_selection=out_sel,
+                        is_complete=is_complete_chunk,
+                        drop_axes=(),
+                        decode=inner_transform.decode_chunk,
+                        encode=inner_transform.encode_chunk,
+                    )
+                shard_dict[chunk_coords] = scalar_complete_result  # type: ignore[assignment]
+                continue
+
+            # A complete chunk fully overwrites: skip decoding what it replaces.
+            existing_raw = None if is_complete_chunk else shard_dict.get(chunk_coords)
+            shard_dict[chunk_coords] = merge_and_encode_chunk(
+                existing_raw,
+                value,
+                chunk_spec=chunk_spec,
+                chunk_selection=chunk_sel,
+                out_selection=out_sel,
+                is_complete=is_complete_chunk,
+                drop_axes=(),
+                decode=inner_transform.decode_chunk,
+                encode=inner_transform.encode_chunk,
+            )
+
+        blob = self._encode_shard_dict_sync(
+            shard_dict,
+            chunks_per_shard=chunks_per_shard,
+            buffer_prototype=default_buffer_prototype(),
+        )
+        if blob is None:
+            byte_setter.delete_sync()
+        else:
+            byte_setter.set_sync(blob)
+
+    def _build_shard_layout(
+        self,
+        shard_dict: ShardMapping,
+        chunks_per_shard: tuple[int, ...],
+    ) -> tuple[_ShardIndex, list[Buffer]] | None:
+        """Lay out the present inner chunks of a shard. Pure compute, no IO.
+
+        Packs the encoded inner chunks (in the codec's `subchunk_write_order`)
+        into a contiguous data section and builds a shard index pointing each
+        present chunk at its ABSOLUTE byte offset within the final blob: when
+        the index is stored at the start, offsets are pre-shifted by the index
+        size (known without encoding — the index codecs are fixed-size, which
+        every index read path already relies on), so the index can be encoded
+        exactly once by the caller.
+
+        Returns `(index, data_buffers)`, or `None` for an all-empty shard (no
+        chunks present). Shared by the sync and async `_encode_shard_dict*` so
+        the layout/offset logic cannot drift between them.
+        """
+        index = _ShardIndex.create_empty(chunks_per_shard)
+        buffers: list[Buffer] = []
+        chunk_start = (
+            self._shard_index_size(chunks_per_shard)
+            if self.index_location == ShardingCodecIndexLocation.start
+            else 0
+        )
+
+        for chunk_coords in self._subchunk_order_iter(chunks_per_shard, self.subchunk_write_order):
+            value = shard_dict.get(chunk_coords)
+            if value is None or len(value) == 0:
+                continue
+            chunk_length = len(value)
+            buffers.append(value)
+            index.set_chunk_slice(chunk_coords, slice(chunk_start, chunk_start + chunk_length))
+            chunk_start += chunk_length
+
+        if len(buffers) == 0:
+            return None
+        return index, buffers
+
+    def _assemble_shard(
+        self,
+        index_bytes: Buffer,
+        buffers: list[Buffer],
+        buffer_prototype: BufferPrototype,
+        *,
+        chunks_per_shard: tuple[int, ...],
+    ) -> Buffer:
+        """Concatenate the encoded index and data buffers into the shard blob.
+
+        The layout from `_build_shard_layout` already assumes the index size, so
+        the encoded index length must match `_shard_index_size` exactly — guard
+        that assumption rather than silently corrupt offsets. The guard lives
+        here, once, for both the sync and async encode paths.
+        """
+        if len(index_bytes) != self._shard_index_size(chunks_per_shard):
+            raise RuntimeError(
+                "encoded shard index size does not match _shard_index_size; "
+                "variable-size index codecs are not supported"
+            )
+        if self.index_location == ShardingCodecIndexLocation.start:
+            buffers.insert(0, index_bytes)
+        else:
+            buffers.append(index_bytes)
+        template = buffer_prototype.buffer.create_zero_length()
+        return template.combine(buffers)
+
+    def _encode_shard_dict_sync(
+        self,
+        shard_dict: ShardMapping,
+        chunks_per_shard: tuple[int, ...],
+        buffer_prototype: BufferPrototype,
+    ) -> Buffer | None:
+        """Sync version of _encode_shard_dict.
+
+        Layout via the shared `_build_shard_layout` (offsets already absolute),
+        then a single index encode and concatenation.
+
+        Returns `None` for an all-empty shard (no chunks present).
+        """
+        layout = self._build_shard_layout(shard_dict, chunks_per_shard)
+        if layout is None:
+            return None
+        index, buffers = layout
+        index_bytes = self._encode_shard_index_sync(index)
+        return self._assemble_shard(
+            index_bytes, buffers, buffer_prototype, chunks_per_shard=chunks_per_shard
+        )
+
     async def _decode_single(
         self,
         shard_bytes: Buffer,
@@ -445,7 +945,7 @@ async def _decode_single(
             return out
 
         # decoding chunks and writing them into the output buffer
-        await self.codec_pipeline.read(
+        await self._get_inner_pipeline(shard_spec).read(
             [
                 (
                     _ShardingByteGetter(shard_dict, chunk_coords),
@@ -511,7 +1011,7 @@ async def _decode_partial_single(
         shard_dict = shard_dict_maybe
 
         # decoding chunks and writing them into the output buffer
-        await self.codec_pipeline.read(
+        await self._get_inner_pipeline(shard_spec).read(
             [
                 (
                     _ShardingByteGetter(shard_dict, chunk_coords),
@@ -548,6 +1048,192 @@ def _subchunk_order_iter(
                 raise ValueError(f"Unrecognized subchunk write order: {subchunk_write_order!r}.")
         return subchunk_iter
 
+    def _decode_full_shard_bulk(
+        self,
+        shard_bytes: Buffer,
+        shard_spec: ArraySpec,
+        indexer: Any,
+    ) -> NDBuffer | None:
+        """Vectorized whole-shard decode for dense, fixed-size, uncompressed shards.
+
+        Returns the assembled shard array, or None if the fast path does not
+        apply (so the caller falls back to the per-chunk loop). Conditions:
+        - inner codec chain is fixed-size (no compression / variable-length);
+        - the inner codec chain is exactly a single BytesCodec — decode is a
+          dtype/endian view with no reordering. A trailing crc32c is NOT accepted
+          (the bulk path can't verify per-chunk checksums, so crc shards keep the
+          per-chunk path's corruption detection);
+        - the stored index is dense (every chunk present, equal fixed length,
+          contiguous) so the data section is a regular grid of chunk payloads.
+
+        Chunk positions are read from the stored index, so this is correct for
+        any `subchunk_write_order` (morton / lexicographic / colexicographic /
+        unordered). The on-disk byte order is taken from the BytesCodec's
+        `endian`, so big- and little-endian shards both decode correctly.
+        """
+        # --- gate on a trivial, fixed-size inner codec chain ---
+        if not self._inner_codecs_fixed_size:
+            return None
+        # The inner chain must be exactly a single BytesCodec (a dtype/endian
+        # view, no reordering). A trailing Crc32cCodec is excluded on purpose:
+        # the bulk path would have to strip-and-discard the per-chunk checksum
+        # bytes, silently dropping the corruption detection the per-chunk path
+        # enforces (Crc32cCodec._decode_sync raises on mismatch). crc-protected
+        # shards therefore fall through to the per-chunk path.
+        if len(self.codecs) != 1 or not isinstance(self.codecs[0], BytesCodec):
+            return None
+        ab_codec = self.codecs[0]
+
+        chunks_per_shard = self._get_chunks_per_shard(shard_spec)
+        chunk_spec = self._get_chunk_spec(shard_spec)
+        n_chunks = product(chunks_per_shard)
+        if n_chunks == 0:
+            return None
+
+        # Only valid when the read is the ENTIRE shard contiguously (output shape
+        # equals the shard shape). A strided/fancy read may touch all chunks but
+        # not want the whole grid laid out densely — those must use the per-chunk
+        # path so chunk_selection / out_selection are honored.
+        if tuple(indexer.shape) != tuple(shard_spec.shape):
+            return None
+        chunk_byte_length = self._inner_chunk_byte_length(chunk_spec)
+
+        shard_index_size = self._shard_index_size(chunks_per_shard)
+        if len(shard_bytes) != n_chunks * chunk_byte_length + shard_index_size:
+            return None  # not a dense fixed-size shard
+
+        # --- decode the index; require dense layout ---
+        if self.index_location == ShardingCodecIndexLocation.start:
+            index_bytes = shard_bytes[:shard_index_size]
+        else:
+            index_bytes = shard_bytes[-shard_index_size:]
+        index = self._decode_shard_index_sync(index_bytes, chunks_per_shard)
+        if not index.is_dense(chunk_byte_length):
+            return None
+
+        # --- bulk reconstruct ---
+        # The index gives each chunk's absolute byte offset within the blob; with
+        # a dense, crc-free, fixed-size layout the payload length is exactly the
+        # encoded item-bytes of one chunk.
+        native_dtype = shard_spec.dtype.to_native_dtype()
+        raw = shard_bytes.as_numpy_array().view(np.uint8)
+        payload = chunk_byte_length
+        cs = self.chunk_shape
+
+        # On-disk byte order is carried by the BytesCodec's `endian`, NOT by the
+        # data type (zarr v3). Build the read-view dtype from the codec's endian
+        # exactly as BytesCodec._decode_sync does, so a big-endian shard read on a
+        # little-endian host (or vice versa) is interpreted correctly. Assigning
+        # into the native-dtype `out` then performs any needed byteswap.
+        endian_str = ab_codec.endian.value if ab_codec.endian is not None else None
+        if isinstance(chunk_spec.dtype, HasEndianness):
+            stored_dtype = replace(chunk_spec.dtype, endianness=endian_str).to_native_dtype()  # type: ignore[call-arg]
+        else:
+            stored_dtype = chunk_spec.dtype.to_native_dtype()
+
+        offsets = index.offsets_and_lengths[..., 0].reshape(-1)  # localized coords, C-order
+        coords_c = list(np.ndindex(chunks_per_shard))
+        out = shard_spec.prototype.nd_buffer.empty(
+            shape=indexer.shape, dtype=native_dtype, order=shard_spec.order
+        )
+        for flat, coord in enumerate(coords_c):
+            start = int(offsets[flat])
+            chunk = raw[start : start + payload].view(stored_dtype).reshape(cs)
+            sel = tuple(slice(c * s, c * s + s) for c, s in zip(coord, cs, strict=True))
+            out[sel] = chunk
+        return out
+
+    def _decode_partial_sync(
+        self,
+        byte_getter: Any,
+        selection: SelectorTuple,
+        shard_spec: ArraySpec,
+    ) -> NDBuffer | None:
+        """Sync equivalent of `_decode_partial_single`.
+
+        Reads only the inner-chunk byte ranges that overlap `selection`
+        (plus the shard index) and decodes them through the inner codec
+        chain.  The store must support `get_sync` with byte ranges.
+
+        This method does not parallelize decompression, but should.
+        See TODO: make issue for handling subchunk parallelism
+
+        Two sub-paths:
+        - If `selection` covers the entire shard, just fetch the whole
+          blob — that's strictly cheaper than two round trips (index, then
+          data) plus the per-chunk overhead of partial fetches.
+        - Otherwise fetch the index alone, look up only the byte slices of
+          the inner chunks the selection touches, fetch those, and decode.
+        """
+        shard_shape = shard_spec.shape
+        chunk_shape = self.chunk_shape
+        chunks_per_shard = self._get_chunks_per_shard(shard_spec)
+        chunk_spec = self._get_chunk_spec(shard_spec)
+        inner_transform = self._get_inner_chunk_transform(shard_spec)
+
+        indexer = get_indexer(
+            selection,
+            shape=shard_shape,
+            chunk_grid=ChunkGrid.from_sizes(shard_shape, chunk_shape),
+        )
+
+        out = shard_spec.prototype.nd_buffer.empty(
+            shape=indexer.shape,
+            dtype=shard_spec.dtype.to_native_dtype(),
+            order=shard_spec.order,
+        )
+
+        indexed_chunks = list(indexer)
+        all_chunk_coords = {chunk_coords for chunk_coords, *_ in indexed_chunks}
+
+        # Read just the inner chunks we need.
+        if self._is_total_shard(all_chunk_coords, chunks_per_shard):
+            shard_bytes = byte_getter.get_sync(prototype=chunk_spec.prototype)
+            if shard_bytes is None:
+                return None
+            # Bulk fast path: a whole-shard read of a dense, fixed-size shard
+            # (no compression/filters) is just the data section reshaped and
+            # reordered into the grid -- no per-chunk decode/scatter loop.
+            # Order-agnostic: chunk positions come from the stored index, so it
+            # is correct for any subchunk_write_order. Falls through on any
+            # mismatch (compression, partial shard, non-trivial inner codec).
+            bulk = self._decode_full_shard_bulk(shard_bytes, shard_spec, indexer)
+            if bulk is not None:
+                if hasattr(indexer, "sel_shape"):
+                    return bulk.reshape(indexer.sel_shape)
+                return bulk
+            shard_reader = self._shard_reader_from_bytes_sync(shard_bytes, chunks_per_shard)
+            shard_dict: ShardMapping = shard_reader
+        else:
+            # Partial read: fetch only the touched inner chunks, coalescing
+            # adjacent byte ranges (mirrors the async _load_partial_shard_maybe
+            # / #3004). Returns None if the shard is absent.
+            partial = self._load_partial_shard_maybe_sync(
+                byte_getter, chunk_spec.prototype, chunks_per_shard, all_chunk_coords
+            )
+            if partial is None:
+                return None
+            shard_dict = partial
+
+        from zarr.core.codec_pipeline import decode_and_scatter_chunk
+
+        # Decode each needed inner chunk and scatter into out (statuses
+        # discarded: missing inner chunks fill, see _decode_sync).
+        for chunk_coords, chunk_selection, out_selection, _ in indexed_chunks:
+            decode_and_scatter_chunk(
+                shard_dict.get(chunk_coords),
+                out,
+                chunk_spec=chunk_spec,
+                chunk_selection=chunk_selection,
+                out_selection=out_selection,
+                drop_axes=(),
+                decode=inner_transform.decode_chunk,
+            )
+
+        if hasattr(indexer, "sel_shape"):
+            return out.reshape(indexer.sel_shape)
+        return out
+
     async def _encode_single(
         self,
         shard_array: NDBuffer,
@@ -569,7 +1255,7 @@ async def _encode_single(
         # decided later by the `subchunk_write_order` loop in `_encode_shard_dict`.
         shard_builder = dict.fromkeys(np.ndindex(chunks_per_shard))
 
-        await self.codec_pipeline.write(
+        await self._get_inner_pipeline(shard_spec).write(
             [
                 (
                     _ShardingByteSetter(shard_builder, chunk_coords),
@@ -621,10 +1307,10 @@ async def _encode_partial_single(
             shard_reader = shard_reader or _ShardReader.create_empty(chunks_per_shard)
             # Use vectorized lookup for better performance
             shard_dict = shard_reader.to_dict_vectorized(
-                np.array(list(np.ndindex(chunks_per_shard)))
+                np.indices(chunks_per_shard).reshape(len(chunks_per_shard), -1).T
             )
 
-        await self.codec_pipeline.write(
+        await self._get_inner_pipeline(shard_spec).write(
             [
                 (
                     _ShardingByteSetter(shard_dict, chunk_coords),
@@ -654,40 +1340,17 @@ async def _encode_shard_dict(
         chunks_per_shard: tuple[int, ...],
         buffer_prototype: BufferPrototype,
     ) -> Buffer | None:
-        index = _ShardIndex.create_empty(chunks_per_shard)
-
-        buffers = []
-
-        template = buffer_prototype.buffer.create_zero_length()
-        chunk_start = 0
-        for chunk_coords in self._subchunk_order_iter(chunks_per_shard, self.subchunk_write_order):
-            value = map.get(chunk_coords)
-            if value is None:
-                continue
-
-            if len(value) == 0:
-                continue
-
-            chunk_length = len(value)
-            buffers.append(value)
-            index.set_chunk_slice(chunk_coords, slice(chunk_start, chunk_start + chunk_length))
-            chunk_start += chunk_length
-
-        if len(buffers) == 0:
+        """Layout via the shared `_build_shard_layout` (offsets already
+        absolute), then a single index encode and concatenation. Async twin of
+        `_encode_shard_dict_sync`."""
+        layout = self._build_shard_layout(map, chunks_per_shard)
+        if layout is None:
             return None
-
+        index, buffers = layout
         index_bytes = await self._encode_shard_index(index)
-        if self.index_location == ShardingCodecIndexLocation.start:
-            empty_chunks_mask = index.offsets_and_lengths[..., 0] == MAX_UINT_64
-            index.offsets_and_lengths[~empty_chunks_mask, 0] += len(index_bytes)
-            index_bytes = await self._encode_shard_index(
-                index
-            )  # encode again with corrected offsets
-            buffers.insert(0, index_bytes)
-        else:
-            buffers.append(index_bytes)
-
-        return template.combine(buffers)
+        return self._assemble_shard(
+            index_bytes, buffers, buffer_prototype, chunks_per_shard=chunks_per_shard
+        )
 
     def _is_total_shard(
         self, all_chunk_coords: set[tuple[int, ...]], chunks_per_shard: tuple[int, ...]
@@ -706,23 +1369,35 @@ def _is_complete_shard_write(
             is_complete_chunk for *_, is_complete_chunk in indexed_chunks
         )
 
+    def _index_codecs_sync_capable(self) -> bool:
+        return all(isinstance(c, SupportsSyncCodec) for c in self.index_codecs)
+
     async def _decode_shard_index(
         self, index_bytes: Buffer, chunks_per_shard: tuple[int, ...]
     ) -> _ShardIndex:
+        # Pure compute (the bytes are already in hand): delegate to the sync
+        # implementation instead of spinning up a pipeline + per-call
+        # AsyncChunkTransform for a tiny fixed-size decode. The default
+        # (bytes + crc32c) index chain is sync-capable; an async-only
+        # third-party index codec falls back to the full async pipeline, which
+        # the synchronous read paths cannot use but this async path still can.
+        if self._index_codecs_sync_capable():
+            return self._decode_shard_index_sync(index_bytes, chunks_per_shard)
         index_array = next(
             iter(
                 await get_pipeline_class()
                 .from_codecs(self.index_codecs)
-                .decode(
-                    [(index_bytes, self._get_index_chunk_spec(chunks_per_shard))],
-                )
+                .decode([(index_bytes, self._get_index_chunk_spec(chunks_per_shard))])
             )
         )
-        # This cannot be None because we have the bytes already
-        index_array = cast(NDBuffer, index_array)
+        assert index_array is not None  # the bytes are already in hand
         return _ShardIndex(chunks_per_shard, index_array.as_numpy_array())
 
     async def _encode_shard_index(self, index: _ShardIndex) -> Buffer:
+        # Pure compute: delegate to the sync implementation, with the same
+        # async-pipeline fallback as _decode_shard_index.
+        if self._index_codecs_sync_capable():
+            return self._encode_shard_index_sync(index)
         index_bytes = next(
             iter(
                 await get_pipeline_class()
@@ -733,12 +1408,11 @@ async def _encode_shard_index(self, index: _ShardIndex) -> Buffer:
                             get_ndbuffer_class().from_numpy_array(index.offsets_and_lengths),
                             self._get_index_chunk_spec(index.chunks_per_shard),
                         )
-                    ],
+                    ]
                 )
             )
         )
         assert index_bytes is not None
-        assert isinstance(index_bytes, Buffer)
         return index_bytes
 
     def _shard_index_size(self, chunks_per_shard: tuple[int, ...]) -> int:
@@ -780,30 +1454,48 @@ def _get_chunks_per_shard(self, shard_spec: ArraySpec) -> tuple[int, ...]:
             )
         )
 
+    def _shard_index_byte_range(
+        self, chunks_per_shard: tuple[int, ...]
+    ) -> RangeByteRequest | SuffixByteRequest:
+        """Byte range of the shard index within the shard blob.
+
+        Single source of truth for the index-location arithmetic, shared by the
+        sync and async index loaders so they cannot drift.
+        """
+        shard_index_size = self._shard_index_size(chunks_per_shard)
+        if self.index_location == ShardingCodecIndexLocation.start:
+            return RangeByteRequest(0, shard_index_size)
+        return SuffixByteRequest(shard_index_size)
+
+    @staticmethod
+    def _pair_chunks_with_byte_ranges(
+        shard_index: _ShardIndex, all_chunk_coords: set[tuple[int, ...]]
+    ) -> list[tuple[tuple[int, ...], RangeByteRequest]]:
+        """Pair each requested chunk coord with its byte range in the shard.
+
+        Coords whose chunk is absent from the index are omitted. Shared by the
+        sync and async partial-shard loaders.
+        """
+        chunk_coord_byte_ranges: list[tuple[tuple[int, ...], RangeByteRequest]] = []
+        for chunk_coord in all_chunk_coords:
+            chunk_byte_slice = shard_index.get_chunk_slice(chunk_coord)
+            if chunk_byte_slice is not None:
+                chunk_coord_byte_ranges.append(
+                    (chunk_coord, RangeByteRequest(chunk_byte_slice[0], chunk_byte_slice[1]))
+                )
+        return chunk_coord_byte_ranges
+
     async def _load_shard_index_maybe(
         self, byte_getter: ByteGetter, chunks_per_shard: tuple[int, ...]
     ) -> _ShardIndex | None:
-        shard_index_size = self._shard_index_size(chunks_per_shard)
-        if self.index_location == ShardingCodecIndexLocation.start:
-            index_bytes = await byte_getter.get(
-                prototype=numpy_buffer_prototype(),
-                byte_range=RangeByteRequest(0, shard_index_size),
-            )
-        else:
-            index_bytes = await byte_getter.get(
-                prototype=numpy_buffer_prototype(), byte_range=SuffixByteRequest(shard_index_size)
-            )
+        index_bytes = await byte_getter.get(
+            prototype=numpy_buffer_prototype(),
+            byte_range=self._shard_index_byte_range(chunks_per_shard),
+        )
         if index_bytes is not None:
             return await self._decode_shard_index(index_bytes, chunks_per_shard)
         return None
 
-    async def _load_shard_index(
-        self, byte_getter: ByteGetter, chunks_per_shard: tuple[int, ...]
-    ) -> _ShardIndex:
-        return (
-            await self._load_shard_index_maybe(byte_getter, chunks_per_shard)
-        ) or _ShardIndex.create_empty(chunks_per_shard)
-
     async def _load_full_shard_maybe(
         self, byte_getter: ByteGetter, prototype: BufferPrototype, chunks_per_shard: tuple[int, ...]
     ) -> _ShardReader | None:
@@ -815,6 +1507,19 @@ async def _load_full_shard_maybe(
             else None
         )
 
+    @property
+    def _inner_codecs_fixed_size(self) -> bool:
+        """True when all inner codecs produce fixed-size output (no compression)."""
+        return all(c.is_fixed_size for c in self.codecs)
+
+    def _inner_chunk_byte_length(self, chunk_spec: ArraySpec) -> int:
+        """Encoded byte length of a single inner chunk. Only valid when _inner_codecs_fixed_size."""
+        raw_byte_length = 1
+        for s in self.chunk_shape:
+            raw_byte_length *= s
+        raw_byte_length *= chunk_spec.dtype.item_size  # type: ignore[attr-defined]
+        return int(self.codec_pipeline.compute_encoded_size(raw_byte_length, chunk_spec))
+
     async def _load_partial_shard_maybe(
         self,
         byte_getter: ByteGetter,
@@ -830,14 +1535,7 @@ async def _load_partial_shard_maybe(
         if shard_index is None:
             return None
 
-        # Pair up chunks and their byte ranges as list[tuple[chunk_coord, byte_range]]
-        chunk_coord_byte_ranges: list[tuple[tuple[int, ...], RangeByteRequest]] = []
-        for chunk_coord in all_chunk_coords:
-            chunk_byte_slice = shard_index.get_chunk_slice(chunk_coord)
-            if chunk_byte_slice is not None:
-                chunk_coord_byte_ranges.append(
-                    (chunk_coord, RangeByteRequest(chunk_byte_slice[0], chunk_byte_slice[1]))
-                )
+        chunk_coord_byte_ranges = self._pair_chunks_with_byte_ranges(shard_index, all_chunk_coords)
 
         if not chunk_coord_byte_ranges:
             return {}
@@ -875,6 +1573,68 @@ async def _load_partial_shard_maybe(
 
         return shard_dict
 
+    def _load_shard_index_maybe_sync(
+        self, byte_getter: Any, chunks_per_shard: tuple[int, ...]
+    ) -> _ShardIndex | None:
+        """Sync counterpart of `_load_shard_index_maybe`."""
+        index_bytes = byte_getter.get_sync(
+            prototype=numpy_buffer_prototype(),
+            byte_range=self._shard_index_byte_range(chunks_per_shard),
+        )
+        if index_bytes is not None:
+            return self._decode_shard_index_sync(index_bytes, chunks_per_shard)
+        return None
+
+    def _load_partial_shard_maybe_sync(
+        self,
+        byte_getter: Any,
+        prototype: BufferPrototype,
+        chunks_per_shard: tuple[int, ...],
+        all_chunk_coords: set[tuple[int, ...]],
+    ) -> ShardMapping | None:
+        """Sync counterpart of `_load_partial_shard_maybe` (the #3004 read path).
+
+        Reads the shard index, then fetches only the touched inner chunks via the
+        store's coalescing `get_ranges_sync` (merging adjacent ranges into fewer
+        reads), matching the async path's IO shape without an event loop.
+        """
+        shard_index = self._load_shard_index_maybe_sync(byte_getter, chunks_per_shard)
+        if shard_index is None:
+            return None
+
+        chunk_coord_byte_ranges = self._pair_chunks_with_byte_ranges(shard_index, all_chunk_coords)
+
+        if not chunk_coord_byte_ranges:
+            return {}
+
+        shard_dict: ShardMutableMapping = {}
+        store = byte_getter.store if hasattr(byte_getter, "store") else None
+        if isinstance(store, Store) and isinstance(store, SupportsGetSync):
+            # External store: coalesce via get_ranges_sync (mirrors get_ranges).
+            byte_ranges = [byte_range for _, byte_range in chunk_coord_byte_ranges]
+            try:
+                for idx, buf in store.get_ranges_sync(
+                    byte_getter.path, byte_ranges, prototype=prototype
+                ):
+                    if buf is not None:
+                        chunk_coord, _ = chunk_coord_byte_ranges[idx]
+                        shard_dict[chunk_coord] = buf
+            except BaseExceptionGroup as eg:
+                # Mirror the async path: a FileNotFoundError means the shard was
+                # deleted mid-read -> treat as "gone" (None). Re-raise anything else.
+                _, rest = eg.split(FileNotFoundError)
+                if rest is not None:
+                    raise rest from None
+                return None
+        else:
+            # Nested sharding: an in-memory _ShardingByteGetter, no IO to coalesce.
+            for chunk_coord, byte_range in chunk_coord_byte_ranges:
+                buf = byte_getter.get_sync(prototype=prototype, byte_range=byte_range)
+                if buf is not None:
+                    shard_dict[chunk_coord] = buf
+
+        return shard_dict
+
     def compute_encoded_size(self, input_byte_length: int, shard_spec: ArraySpec) -> int:
         chunks_per_shard = self._get_chunks_per_shard(shard_spec)
         return input_byte_length + self._shard_index_size(chunks_per_shard)
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index d15c70064b..bdf0bbe639 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -227,10 +227,35 @@ def create_codec_pipeline(metadata: ArrayMetadata, *, store: Store | None = None
             pass
 
     if isinstance(metadata, ArrayV3Metadata):
-        return get_pipeline_class().from_codecs(metadata.codecs)
+        pipeline = get_pipeline_class().from_codecs(metadata.codecs)
+        from zarr.core.metadata.v3 import RegularChunkGridMetadata
+
+        # Use the regular chunk shape if available, otherwise use a
+        # placeholder. The ChunkTransform is shape-agnostic — the actual
+        # chunk shape is passed per-call at decode/encode time.
+        if isinstance(metadata.chunk_grid, RegularChunkGridMetadata):
+            chunk_shape = metadata.chunk_grid.chunk_shape
+        else:
+            chunk_shape = (1,) * len(metadata.shape)
+        chunk_spec = ArraySpec(
+            shape=chunk_shape,
+            dtype=metadata.data_type,
+            fill_value=metadata.fill_value,
+            config=ArrayConfig.from_dict({}),
+            prototype=default_buffer_prototype(),
+        )
+        return pipeline.evolve_from_array_spec(chunk_spec)
     elif isinstance(metadata, ArrayV2Metadata):
         v2_codec = V2Codec(filters=metadata.filters, compressor=metadata.compressor)
-        return get_pipeline_class().from_codecs([v2_codec])
+        pipeline = get_pipeline_class().from_codecs([v2_codec])
+        chunk_spec = ArraySpec(
+            shape=metadata.chunks,
+            dtype=metadata.dtype,
+            fill_value=metadata.fill_value,
+            config=ArrayConfig.from_dict({"order": metadata.order}),
+            prototype=default_buffer_prototype(),
+        )
+        return pipeline.evolve_from_array_spec(chunk_spec)
     raise TypeError  # pragma: no cover
 
 
diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py
index 5c26681d6b..b8cff3c55c 100644
--- a/src/zarr/core/codec_pipeline.py
+++ b/src/zarr/core/codec_pipeline.py
@@ -1,8 +1,11 @@
 from __future__ import annotations
 
+import asyncio
+import threading
+from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass, field
 from itertools import islice, pairwise
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any, cast
 from warnings import warn
 
 from zarr.abc.codec import (
@@ -23,7 +26,7 @@
 from zarr.registry import register_pipeline
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable, Iterator
+    from collections.abc import Callable, Iterable, Iterator, Sequence
     from typing import Self
 
     from zarr.abc.store import ByteGetter, ByteSetter
@@ -33,6 +36,73 @@
     from zarr.core.metadata.v3 import ChunkGridMetadata
 
 
+_pool: ThreadPoolExecutor | None = None
+_pool_size: int = 0
+_pool_lock = threading.Lock()
+
+
+def _resolve_max_workers() -> int:
+    """Resolve `codec_pipeline.max_workers` config to an effective worker count.
+
+    The default is `1` (sequential, no thread pool). `None` means "auto" →
+    `os.cpu_count()` (or 1 if unavailable). Values < 1 are clamped to 1.
+
+    Notes
+    -----
+    Threading is opt-in. The default is sequential because parallelism here is
+    not universally a win and carries downstream risk: enabling it runs custom
+    stores/codecs concurrently, and on many-core nodes a pool sized to
+    `cpu_count` can oversubscribe workloads that already parallelize at a higher
+    level (dask, MPI). It also slows small chunks (≲ 64 KB) by 1.5-3x, where the
+    per-task pool overhead (≈ 30-50 µs submit + worker handoff) outweighs the
+    work.
+
+    For large chunks (≳ 1 MB encoded) where per-chunk decode + scatter is real
+    work, threading helps; opt in with an explicit positive count, or `None` for
+    auto (`os.cpu_count()`):
+
+        zarr.config.set({"codec_pipeline.max_workers": 8})
+        zarr.config.set({"codec_pipeline.max_workers": None})  # auto -> cpu_count
+
+    Approximate breakeven on uncompressed reads:
+    256-512 KB per chunk; compressed chunks shift it lower because decode is real
+    CPU work that benefits from parallelism.
+    """
+    import os as _os
+
+    cfg = config.get("codec_pipeline.max_workers", default=None)
+    if cfg is None:
+        return _os.cpu_count() or 1
+    return max(1, int(cfg))
+
+
+def _get_pool(max_workers: int) -> ThreadPoolExecutor:
+    """Get or create the module-level thread pool, sized to `max_workers`.
+
+    The pool grows on demand — if a request arrives for more workers than the
+    current pool has, it is replaced with a larger one. The previous pool is NOT
+    shut down here: another thread may be holding a reference to it and about to
+    submit (`shutdown` would make its `pool.map` raise "cannot schedule new
+    futures after shutdown"). The orphaned pool finishes its in-flight tasks and
+    is garbage-collected once no caller references it. The pool only grows, never
+    shrinks (a shrink request reuses the larger pool, leaving workers idle).
+
+    Callers that want sequential execution should not call this — they
+    should run the task list inline. `max_workers` must be >= 1.
+    """
+    global _pool, _pool_size
+    if max_workers < 1:
+        raise ValueError(f"max_workers must be >= 1, got {max_workers}")
+    if _pool is None or _pool_size < max_workers:
+        with _pool_lock:
+            if _pool is None or _pool_size < max_workers:
+                # Replace without shutting down the old pool (see docstring):
+                # avoids a race with a concurrent in-flight pool.map on it.
+                _pool = ThreadPoolExecutor(max_workers=max_workers)
+                _pool_size = max_workers
+    return _pool
+
+
 def _unzip2[T, U](iterable: Iterable[tuple[T, U]]) -> tuple[list[T], list[U]]:
     out0: list[T] = []
     out1: list[U] = []
@@ -54,6 +124,101 @@ def resolve_batched(codec: Codec, chunk_specs: Iterable[ArraySpec]) -> Iterable[
     return [codec.resolve_metadata(chunk_spec) for chunk_spec in chunk_specs]
 
 
+def resolve_aa_specs(
+    aa_codecs: tuple[Codec, ...], chunk_spec: ArraySpec
+) -> tuple[tuple[ArraySpec, ...], ArraySpec]:
+    """Resolve the per-stage chunk specs for a single chunk's codec chain.
+
+    Threads `chunk_spec` forward through the array->array codecs via
+    `resolve_metadata` (each codec sees the spec produced by the previous one),
+    returning `(aa_specs, ab_spec)`:
+
+    * `aa_specs[i]` is the spec the i-th AA codec operates on (its *input* on
+      encode / *output* on decode);
+    * `ab_spec` is the spec after all AA codecs — what the array->bytes codec
+      and the bytes->bytes codecs operate on.
+
+    This is the single source of truth for per-stage spec evolution, shared by
+    the synchronous `ChunkTransform` and the asynchronous
+    `AsyncChunkTransform`. It is pure metadata (only `resolve_metadata`), so
+    it places no synchronous-codec requirement on the codecs.
+    """
+    aa_specs: list[ArraySpec] = []
+    spec = chunk_spec
+    for aa_codec in aa_codecs:
+        aa_specs.append(spec)
+        spec = aa_codec.resolve_metadata(spec)
+    return tuple(aa_specs), spec
+
+
+def evolve_codecs(codecs: Iterable[Codec], array_spec: ArraySpec) -> tuple[Codec, ...]:
+    """Evolve a codec chain against ``array_spec``, threading the spec forward.
+
+    Each codec is evolved against the spec produced by the previous one — NOT
+    the original ``array_spec`` — because earlier array->array codecs may
+    transform the chunk spec (e.g. ``cast_value`` widening int8 -> int16). A
+    later codec (notably the array->bytes serializer) must be evolved against
+    the spec it will actually see at run time; evolving every codec against the
+    unthreaded original spec would, for example, strip a ``BytesCodec``'s
+    ``endian`` (it sees the single-byte source dtype) and then fail at decode
+    time on the multi-byte target.
+
+    This is the single source of truth for pipeline-construction-time codec
+    evolution, shared by every ``CodecPipeline.evolve_from_array_spec``. (The
+    per-chunk decode/encode counterpart is ``resolve_aa_specs``.)
+    """
+    evolved: list[Codec] = []
+    spec = array_spec
+    for codec in codecs:
+        evolved_codec = codec.evolve_from_array_spec(array_spec=spec)
+        evolved.append(evolved_codec)
+        spec = evolved_codec.resolve_metadata(spec)
+    return tuple(evolved)
+
+
+def pipeline_supports_partial_decode(
+    array_bytes_codec: ArrayBytesCodec,
+    *,
+    array_array_codecs: tuple[ArrayArrayCodec, ...],
+    bytes_bytes_codecs: tuple[BytesBytesCodec, ...],
+    require_no_aa_bb: bool,
+) -> bool:
+    """Whether a codec pipeline can decode a partial selection without a full read.
+
+    Requires the array->bytes codec to implement
+    ``ArrayBytesCodecPartialDecodeMixin``. When ``require_no_aa_bb`` is True it
+    additionally requires no array->array / bytes->bytes codecs, because those
+    can change the slice<->byte-range correspondence (an AA codec can make the
+    selection non-contiguous, a BB codec can rewrite the bytes), making partial
+    decode infeasible.
+
+    NOTE: the two pipelines currently pass different ``require_no_aa_bb`` values
+    (Batched: True; Fused: False). That divergence is intentional-for-now and
+    tracked separately; this function centralizes the predicate without changing
+    either pipeline's behavior.
+    """
+    if require_no_aa_bb and (len(array_array_codecs) + len(bytes_bytes_codecs)) != 0:
+        return False
+    return isinstance(array_bytes_codec, ArrayBytesCodecPartialDecodeMixin)
+
+
+def pipeline_supports_partial_encode(
+    array_bytes_codec: ArrayBytesCodec,
+    *,
+    array_array_codecs: tuple[ArrayArrayCodec, ...],
+    bytes_bytes_codecs: tuple[BytesBytesCodec, ...],
+    require_no_aa_bb: bool,
+) -> bool:
+    """Whether a codec pipeline can encode a partial selection without a full rewrite.
+
+    Mirror of ``pipeline_supports_partial_decode`` for encoding. See its note re:
+    the per-pipeline ``require_no_aa_bb`` divergence.
+    """
+    if require_no_aa_bb and (len(array_array_codecs) + len(bytes_bytes_codecs)) != 0:
+        return False
+    return isinstance(array_bytes_codec, ArrayBytesCodecPartialEncodeMixin)
+
+
 def fill_value_or_default(chunk_spec: ArraySpec) -> Any:
     fill_value = chunk_spec.fill_value
     if fill_value is None:
@@ -67,26 +232,355 @@ def fill_value_or_default(chunk_spec: ArraySpec) -> Any:
         return fill_value
 
 
+def _merge_chunk_array(
+    existing_chunk_array: NDBuffer | None,
+    value: NDBuffer,
+    out_selection: SelectorTuple,
+    chunk_spec: ArraySpec,
+    chunk_selection: SelectorTuple,
+    is_complete_chunk: bool,
+    drop_axes: tuple[int, ...],
+) -> NDBuffer:
+    """Merge `value` into a full-chunk-shaped NDBuffer at `chunk_selection`.
+
+    If `is_complete_chunk` and `value[out_selection]` is exactly chunk-shaped,
+    that VIEW of the caller's `value` is returned without copying — callers
+    (and the codecs they pass it to) must treat it as read-only, since
+    mutating it would corrupt the user's source array. Otherwise, a writable
+    buffer is materialized — either from `existing_chunk_array.copy()` if
+    one was read from the store, or freshly allocated and filled with the
+    chunk's fill value — and the relevant slice of `value` is written into it.
+    """
+    if is_complete_chunk and value.shape != ():
+        selected = value[out_selection]
+        # The shape check guards against a partial edge chunk arriving with
+        # is_complete_chunk=True, and against dropped axes (size-1 integer
+        # dims), where the selection is not exactly chunk-shaped.
+        if selected.shape == chunk_spec.shape:
+            return selected
+    if existing_chunk_array is None:
+        chunk_array = chunk_spec.prototype.nd_buffer.create(
+            shape=chunk_spec.shape,
+            dtype=chunk_spec.dtype.to_native_dtype(),
+            order=chunk_spec.order,
+            fill_value=fill_value_or_default(chunk_spec),
+        )
+    else:
+        chunk_array = existing_chunk_array.copy()
+    if chunk_selection == () or is_scalar(
+        value.as_ndarray_like(), chunk_spec.dtype.to_native_dtype()
+    ):
+        chunk_value = value
+    else:
+        chunk_value = value[out_selection]
+        if drop_axes:
+            item = tuple(
+                None if idx in drop_axes else slice(None) for idx in range(chunk_spec.ndim)
+            )
+            chunk_value = chunk_value[item]
+    chunk_array[chunk_selection] = chunk_value
+    return chunk_array
+
+
+def chunk_is_empty(chunk_array: NDBuffer, chunk_spec: ArraySpec) -> bool:
+    """THE empty-chunk normalization rule, in one place.
+
+    With ``write_empty_chunks=False`` (the default), a chunk whose decoded
+    content equals the fill value normalizes to *missing*: it must not be
+    stored, and readers reconstruct it from the fill value. Every write path
+    (fused, async fallback, shard inner chunks) must apply this same rule —
+    scattering inline ``all_equal`` checks is how the rule drifts.
+    """
+    return not chunk_spec.config.write_empty_chunks and chunk_array.all_equal(
+        fill_value_or_default(chunk_spec)
+    )
+
+
+def encode_or_elide_chunk(
+    chunk_array: NDBuffer,
+    chunk_spec: ArraySpec,
+    encode: Callable[[NDBuffer, ArraySpec], Buffer | None],
+) -> Buffer | None:
+    """Encode a merged chunk, normalizing empties to missing.
+
+    Returns the bytes to store, or ``None`` meaning the chunk must NOT be
+    stored (either it normalized to empty per `chunk_is_empty`, or the codec
+    chain elided it). ``None`` is the single "missing" convention shared by
+    the chunk write paths and the shard dicts.
+    """
+    if chunk_is_empty(chunk_array, chunk_spec):
+        return None
+    return encode(chunk_array, chunk_spec)
+
+
+def merge_and_encode_chunk(
+    existing_bytes: Buffer | None,
+    value: NDBuffer,
+    *,
+    chunk_spec: ArraySpec,
+    chunk_selection: SelectorTuple,
+    out_selection: SelectorTuple,
+    is_complete: bool,
+    drop_axes: tuple[int, ...],
+    decode: Callable[[Buffer, ArraySpec], NDBuffer],
+    encode: Callable[[NDBuffer, ArraySpec], Buffer | None],
+) -> Buffer | None:
+    """The canonical single-chunk write: merge, normalize, encode.
+
+    decode existing (``None`` = chunk currently missing) -> merge ``value`` at
+    ``chunk_selection`` -> normalize empties to missing -> encode. Returns the
+    bytes to store or ``None`` = do not store / delete. This is the one
+    state-transition every per-chunk write path expresses; only the IO around
+    it (where ``existing_bytes`` comes from, where the result goes) differs.
+    """
+    existing_array = decode(existing_bytes, chunk_spec) if existing_bytes is not None else None
+    merged = _merge_chunk_array(
+        existing_array, value, out_selection, chunk_spec, chunk_selection, is_complete, drop_axes
+    )
+    return encode_or_elide_chunk(merged, chunk_spec, encode)
+
+
+def scatter_chunk(
+    selected: NDBuffer | None,
+    out: NDBuffer,
+    *,
+    chunk_spec: ArraySpec,
+    out_selection: SelectorTuple,
+    drop_axes: tuple[int, ...],
+) -> GetResult:
+    """Scatter one chunk's (already-selected) decoded region into ``out``.
+
+    ``None`` = the chunk is missing: the fill value is scattered instead and a
+    ``missing`` status is returned. POLICY-FREE by design: whether a missing
+    chunk is an error (``read_missing_chunks=False``) is decided by the array
+    layer from the returned statuses — which is also what makes missing INNER
+    chunks of a present shard fill rather than raise (the sharding codec
+    discards the nested read's statuses; only top-level statuses reach the
+    array layer).
+    """
+    if selected is None:
+        out[out_selection] = fill_value_or_default(chunk_spec)
+        return GetResult(status="missing")
+    if drop_axes:
+        selected = selected.squeeze(axis=drop_axes)
+    out[out_selection] = selected
+    return GetResult(status="present")
+
+
+def decode_and_scatter_chunk(
+    chunk_bytes: Buffer | None,
+    out: NDBuffer,
+    *,
+    chunk_spec: ArraySpec,
+    chunk_selection: SelectorTuple,
+    out_selection: SelectorTuple,
+    drop_axes: tuple[int, ...],
+    decode: Callable[[Buffer, ArraySpec], NDBuffer],
+) -> GetResult:
+    """The canonical single-chunk read: decode stored bytes (``None`` =
+    missing), select, and scatter into ``out`` via `scatter_chunk`. The read
+    twin of `merge_and_encode_chunk`.
+    """
+    if chunk_bytes is None:
+        return scatter_chunk(
+            None, out, chunk_spec=chunk_spec, out_selection=out_selection, drop_axes=drop_axes
+        )
+    selected = decode(chunk_bytes, chunk_spec)[chunk_selection]
+    return scatter_chunk(
+        selected, out, chunk_spec=chunk_spec, out_selection=out_selection, drop_axes=drop_axes
+    )
+
+
+async def _async_read_fallback(
+    pipeline: CodecPipeline,
+    batch: list[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+    out: NDBuffer,
+    drop_axes: tuple[int, ...],
+) -> tuple[GetResult, ...]:
+    """Async fallback read used when no fast-path is available.
+
+    Fetches every chunk's bytes via `concurrent_map` (sized by
+    `async.concurrency`), decodes the batch through `pipeline.decode`,
+    then scatters each decoded chunk into `out` at its `out_selection`.
+
+    Used by both `BatchedCodecPipeline.read_batch` (non-partial-decode
+    branch) and `FusedCodecPipeline.read` (when the store is not a
+    `SupportsGetSync` / sync transform is unavailable).
+    """
+    chunk_bytes_batch = await concurrent_map(
+        [(byte_getter, array_spec.prototype) for byte_getter, array_spec, *_ in batch],
+        lambda byte_getter, prototype: byte_getter.get(prototype),
+        config.get("async.concurrency"),
+    )
+    if isinstance(pipeline, FusedCodecPipeline) and pipeline.sync_transform is not None:
+        chunk_array_batch = await asyncio.to_thread(
+            pipeline.decode_sync,
+            [
+                (chunk_bytes, chunk_spec)
+                for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch, strict=False)
+            ],
+        )
+    else:
+        chunk_array_batch = await pipeline.decode(
+            [
+                (chunk_bytes, chunk_spec)
+                for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch, strict=False)
+            ],
+        )
+    results: list[GetResult] = []
+    for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip(
+        chunk_array_batch, batch, strict=False
+    ):
+        selected = None if chunk_array is None else chunk_array[chunk_selection]
+        results.append(
+            scatter_chunk(
+                selected,
+                out,
+                chunk_spec=chunk_spec,
+                out_selection=out_selection,
+                drop_axes=drop_axes,
+            )
+        )
+    return tuple(results)
+
+
+async def _async_write_fallback(
+    pipeline: CodecPipeline,
+    batch: list[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+    value: NDBuffer,
+    drop_axes: tuple[int, ...],
+) -> None:
+    """Async fallback write used when no fast-path is available.
+
+    For each chunk in `batch`: read its existing bytes from the store
+    (skipping the read for complete chunks), decode the batch via
+    `pipeline.decode`, merge `value` into each decoded chunk via
+    `_merge_chunk_array`, drop chunks that are all-fill when
+    `write_empty_chunks` is False, encode the surviving chunks via
+    `pipeline.encode`, then `set` the encoded bytes (or `delete`
+    if encoding produced `None` or the chunk dropped).
+
+    Used by both `BatchedCodecPipeline.write_batch` (non-partial-encode
+    branch) and `FusedCodecPipeline.write` (when the store is not a
+    `SupportsSetSync` / sync transform is unavailable).
+    """
+
+    async def _read_key(
+        byte_setter: ByteSetter | None, prototype: BufferPrototype
+    ) -> Buffer | None:
+        if byte_setter is None:
+            return None
+        return await byte_setter.get(prototype=prototype)
+
+    chunk_bytes_batch: Iterable[Buffer | None]
+    chunk_bytes_batch = await concurrent_map(
+        [
+            (
+                None if is_complete_chunk else byte_setter,
+                chunk_spec.prototype,
+            )
+            for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch
+        ],
+        _read_key,
+        config.get("async.concurrency"),
+    )
+
+    if use_sync := (
+        isinstance(pipeline, FusedCodecPipeline) and pipeline.sync_transform is not None
+    ):
+        chunk_array_decoded = await asyncio.to_thread(
+            pipeline.decode_sync,
+            [
+                (chunk_bytes, chunk_spec)
+                for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch, strict=False)
+            ],
+        )
+    else:
+        chunk_array_decoded = await pipeline.decode(
+            [
+                (chunk_bytes, chunk_spec)
+                for chunk_bytes, (_, chunk_spec, *_) in zip(chunk_bytes_batch, batch, strict=False)
+            ],
+        )
+    chunk_array_merged = [
+        _merge_chunk_array(
+            chunk_array,
+            value,
+            out_selection,
+            chunk_spec,
+            chunk_selection,
+            is_complete_chunk,
+            drop_axes,
+        )
+        for chunk_array, (
+            _,
+            chunk_spec,
+            chunk_selection,
+            out_selection,
+            is_complete_chunk,
+        ) in zip(chunk_array_decoded, batch, strict=False)
+    ]
+    chunk_array_batch: list[NDBuffer | None] = []
+    for chunk_array, (_, chunk_spec, *_) in zip(chunk_array_merged, batch, strict=False):
+        if chunk_array is None:
+            chunk_array_batch.append(None)  # type: ignore[unreachable]
+        else:
+            if chunk_is_empty(chunk_array, chunk_spec):
+                chunk_array_batch.append(None)
+            else:
+                chunk_array_batch.append(chunk_array)
+
+    if use_sync:
+        chunk_bytes_batch = await asyncio.to_thread(
+            cast(FusedCodecPipeline, pipeline).encode_sync,
+            [
+                (chunk_array, chunk_spec)
+                for chunk_array, (_, chunk_spec, *_) in zip(chunk_array_batch, batch, strict=False)
+            ],
+        )
+    else:
+        chunk_bytes_batch = await pipeline.encode(
+            [
+                (chunk_array, chunk_spec)
+                for chunk_array, (_, chunk_spec, *_) in zip(chunk_array_batch, batch, strict=False)
+            ],
+        )
+
+    async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> None:
+        if chunk_bytes is None:
+            await byte_setter.delete()
+        else:
+            await byte_setter.set(chunk_bytes)
+
+    await concurrent_map(
+        [
+            (byte_setter, chunk_bytes)
+            for chunk_bytes, (byte_setter, *_) in zip(chunk_bytes_batch, batch, strict=False)
+        ],
+        _write_key,
+        config.get("async.concurrency"),
+    )
+
+
 @dataclass(slots=True, kw_only=True)
 class ChunkTransform:
-    """A synchronous codec chain bound to an ArraySpec.
+    """A synchronous codec chain.
 
-    Provides `encode` and `decode` for pure-compute codec operations
-    (no IO, no threading, no batching).
+    Provides `encode_chunk` and `decode_chunk` for pure-compute codec
+    operations (no IO, no threading, no batching). The `chunk_spec` is
+    supplied per call so the same transform can be reused across chunks
+    with different shapes, prototypes, etc.
 
     All codecs must implement `SupportsSyncCodec`. Construction will
     raise `TypeError` if any codec does not.
     """
 
     codecs: tuple[Codec, ...]
-    array_spec: ArraySpec
 
-    # (sync codec, input_spec) pairs in pipeline order.
-    _aa_codecs: tuple[tuple[SupportsSyncCodec[NDBuffer, NDBuffer], ArraySpec], ...] = field(
+    _aa_codecs: tuple[SupportsSyncCodec[NDBuffer, NDBuffer], ...] = field(
         init=False, repr=False, compare=False
     )
     _ab_codec: SupportsSyncCodec[NDBuffer, Buffer] = field(init=False, repr=False, compare=False)
-    _ab_spec: ArraySpec = field(init=False, repr=False, compare=False)
     _bb_codecs: tuple[SupportsSyncCodec[Buffer, Buffer], ...] = field(
         init=False, repr=False, compare=False
     )
@@ -100,65 +594,96 @@ def __post_init__(self) -> None:
             )
 
         aa, ab, bb = codecs_from_list(list(self.codecs))
+        # SupportsSyncCodec was verified above; the cast is purely for mypy.
+        self._aa_codecs = cast("tuple[SupportsSyncCodec[NDBuffer, NDBuffer], ...]", tuple(aa))
+        self._ab_codec = cast("SupportsSyncCodec[NDBuffer, Buffer]", ab)
+        self._bb_codecs = cast("tuple[SupportsSyncCodec[Buffer, Buffer], ...]", tuple(bb))
+
+    _cached_key: ArraySpec | None = field(init=False, repr=False, compare=False, default=None)
+    _cached_aa_specs: tuple[ArraySpec, ...] | None = field(
+        init=False, repr=False, compare=False, default=None
+    )
+    _cached_ab_spec: ArraySpec | None = field(init=False, repr=False, compare=False, default=None)
 
-        aa_codecs: list[tuple[SupportsSyncCodec[NDBuffer, NDBuffer], ArraySpec]] = []
-        spec = self.array_spec
-        for aa_codec in aa:
-            assert isinstance(aa_codec, SupportsSyncCodec)
-            aa_codecs.append((aa_codec, spec))
-            spec = aa_codec.resolve_metadata(spec)
+    def _resolve_specs(self, chunk_spec: ArraySpec) -> tuple[tuple[ArraySpec, ...], ArraySpec]:
+        """Return per-AA-codec input specs and the AB spec for `chunk_spec`.
 
-        self._aa_codecs = tuple(aa_codecs)
-        assert isinstance(ab, SupportsSyncCodec)
-        self._ab_codec = ab
-        self._ab_spec = spec
-        bb_sync: list[SupportsSyncCodec[Buffer, Buffer]] = []
-        for bb_codec in bb:
-            assert isinstance(bb_codec, SupportsSyncCodec)
-            bb_sync.append(bb_codec)
-        self._bb_codecs = tuple(bb_sync)
-
-    def decode(
-        self,
-        chunk_bytes: Buffer,
-    ) -> NDBuffer:
+        The resolved chain depends only on the value of `chunk_spec`, so we cache
+        it keyed on `chunk_spec` itself (ArraySpec is a frozen, hashable dataclass
+        — value identity). Keying on `id(chunk_spec)` would be unsafe: ids are
+        recycled after garbage collection, so a freed spec's id reused by a
+        different spec (same shape, different prototype/dtype/config) could yield
+        a stale hit. Value identity avoids that entirely.
+        """
+        if not self._aa_codecs:
+            return (), chunk_spec
+        key = chunk_spec
+        if self._cached_key == key:
+            assert self._cached_aa_specs is not None
+            assert self._cached_ab_spec is not None
+            return self._cached_aa_specs, self._cached_ab_spec
+
+        aa_specs_t, spec = resolve_aa_specs(cast("tuple[Codec, ...]", self._aa_codecs), chunk_spec)
+        self._cached_key = key
+        self._cached_aa_specs = aa_specs_t
+        self._cached_ab_spec = spec
+        return aa_specs_t, spec
+
+    def decode_chunk(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> NDBuffer:
         """Decode a single chunk through the full codec chain, synchronously.
 
         Pure compute -- no IO.
+
+        Parameters
+        ----------
+        chunk_bytes : Buffer
+            The encoded chunk bytes.
+        chunk_spec : ArraySpec
+            The array spec describing shape, dtype, fill value, and codec
+            configuration for this chunk.
         """
+        aa_specs, ab_spec = self._resolve_specs(chunk_spec)
+
         data: Buffer = chunk_bytes
         for bb_codec in reversed(self._bb_codecs):
-            data = bb_codec._decode_sync(data, self._ab_spec)
+            data = bb_codec._decode_sync(data, ab_spec)
 
-        chunk_array: NDBuffer = self._ab_codec._decode_sync(data, self._ab_spec)
+        chunk_array: NDBuffer = self._ab_codec._decode_sync(data, ab_spec)
 
-        for aa_codec, spec in reversed(self._aa_codecs):
-            chunk_array = aa_codec._decode_sync(chunk_array, spec)
+        for aa_codec, aa_spec in zip(reversed(self._aa_codecs), reversed(aa_specs), strict=True):
+            chunk_array = aa_codec._decode_sync(chunk_array, aa_spec)
 
         return chunk_array
 
-    def encode(
-        self,
-        chunk_array: NDBuffer,
-    ) -> Buffer | None:
+    def encode_chunk(self, chunk_array: NDBuffer, chunk_spec: ArraySpec) -> Buffer | None:
         """Encode a single chunk through the full codec chain, synchronously.
 
         Pure compute -- no IO.
+
+        Parameters
+        ----------
+        chunk_array : NDBuffer
+            The chunk data to encode.
+        chunk_spec : ArraySpec
+            The array spec describing shape, dtype, fill value, and codec
+            configuration for this chunk.
         """
+        aa_specs, ab_spec = self._resolve_specs(chunk_spec)
+
         aa_data: NDBuffer = chunk_array
-        for aa_codec, spec in self._aa_codecs:
-            aa_result = aa_codec._encode_sync(aa_data, spec)
+        for aa_codec, aa_spec in zip(self._aa_codecs, aa_specs, strict=True):
+            aa_result = aa_codec._encode_sync(aa_data, aa_spec)
             if aa_result is None:
                 return None
             aa_data = aa_result
 
-        ab_result = self._ab_codec._encode_sync(aa_data, self._ab_spec)
+        ab_result = self._ab_codec._encode_sync(aa_data, ab_spec)
         if ab_result is None:
             return None
 
         bb_data: Buffer = ab_result
         for bb_codec in self._bb_codecs:
-            bb_result = bb_codec._encode_sync(bb_data, self._ab_spec)
+            bb_result = bb_codec._encode_sync(bb_data, ab_spec)
             if bb_result is None:
                 return None
             bb_data = bb_result
@@ -172,6 +697,76 @@ def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int:
         return byte_length
 
 
+@dataclass(slots=True, kw_only=True)
+class AsyncChunkTransform:
+    """A per-chunk asynchronous codec chain — the async mirror of ChunkTransform.
+
+    Decodes/encodes a SINGLE chunk through the full codec chain, awaiting each
+    codec's per-chunk async method (`_decode_single`/`_encode_single`) with the
+    correctly-evolved per-stage spec (via the shared `resolve_aa_specs`).
+
+    Unlike ChunkTransform it places no `SupportsSyncCodec` requirement on the
+    codecs, so it works for async-only codecs. Unlike the batched codec API it
+    operates on one chunk at a time — the mini-batch fan-out is a
+    BatchedCodecPipeline concern and deliberately not reintroduced here.
+    """
+
+    codecs: tuple[Codec, ...]
+
+    _aa_codecs: tuple[ArrayArrayCodec, ...] = field(init=False, repr=False, compare=False)
+    _ab_codec: ArrayBytesCodec = field(init=False, repr=False, compare=False)
+    _bb_codecs: tuple[BytesBytesCodec, ...] = field(init=False, repr=False, compare=False)
+
+    def __post_init__(self) -> None:
+        aa, ab, bb = codecs_from_list(list(self.codecs))
+        self._aa_codecs = aa
+        self._ab_codec = ab
+        self._bb_codecs = bb
+
+    async def decode_chunk(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> NDBuffer:
+        """Decode one chunk through the chain (bb -> ab -> aa), async."""
+        aa_specs, ab_spec = resolve_aa_specs(self._aa_codecs, chunk_spec)
+
+        data: Buffer = chunk_bytes
+        for bb_codec in reversed(self._bb_codecs):
+            data = await bb_codec._decode_single(data, ab_spec)
+
+        chunk_array: NDBuffer = await self._ab_codec._decode_single(data, ab_spec)
+
+        for aa_codec, aa_spec in zip(reversed(self._aa_codecs), reversed(aa_specs), strict=True):
+            chunk_array = await aa_codec._decode_single(chunk_array, aa_spec)
+
+        return chunk_array
+
+    async def encode_chunk(self, chunk_array: NDBuffer, chunk_spec: ArraySpec) -> Buffer | None:
+        """Encode one chunk through the chain (aa -> ab -> bb), async.
+
+        Returns None if any stage drops the chunk (e.g. an all-fill chunk under
+        write_empty_chunks=False), matching ChunkTransform.encode_chunk.
+        """
+        aa_specs, ab_spec = resolve_aa_specs(self._aa_codecs, chunk_spec)
+
+        aa_data: NDBuffer = chunk_array
+        for aa_codec, aa_spec in zip(self._aa_codecs, aa_specs, strict=True):
+            aa_result = await aa_codec._encode_single(aa_data, aa_spec)
+            if aa_result is None:
+                return None
+            aa_data = aa_result
+
+        ab_result = await self._ab_codec._encode_single(aa_data, ab_spec)
+        if ab_result is None:
+            return None
+
+        bb_data: Buffer = ab_result
+        for bb_codec in self._bb_codecs:
+            bb_result = await bb_codec._encode_single(bb_data, ab_spec)
+            if bb_result is None:
+                return None
+            bb_data = bb_result
+
+        return bb_data
+
+
 @dataclass(frozen=True)
 class BatchedCodecPipeline(CodecPipeline):
     """Default codec pipeline.
@@ -187,18 +782,7 @@ class BatchedCodecPipeline(CodecPipeline):
     batch_size: int
 
     def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
-        # Each codec must be evolved against the spec it will actually see
-        # at run-time, not the original array spec. Earlier array->array
-        # codecs may transform the dtype (e.g. cast_value), so the spec
-        # threaded into later codecs (the array->bytes serializer and any
-        # bytes->bytes filters) must reflect those transformations.
-        evolved: list[Codec] = []
-        spec = array_spec
-        for codec in self:
-            evolved_codec = codec.evolve_from_array_spec(array_spec=spec)
-            evolved.append(evolved_codec)
-            spec = evolved_codec.resolve_metadata(spec)
-        return type(self).from_codecs(evolved)
+        return type(self).from_codecs(evolve_codecs(self, array_spec))
 
     @classmethod
     def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self:
@@ -213,34 +797,22 @@ def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None)
 
     @property
     def supports_partial_decode(self) -> bool:
-        """Determines whether the codec pipeline supports partial decoding.
-
-        Currently, only codec pipelines with a single ArrayBytesCodec that supports
-        partial decoding can support partial decoding. This limitation is due to the fact
-        that ArrayArrayCodecs can change the slice selection leading to non-contiguous
-        slices and BytesBytesCodecs can change the chunk bytes in a way that slice
-        selections cannot be attributed to byte ranges anymore which renders partial
-        decoding infeasible.
-
-        This limitation may softened in the future."""
-        return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance(
-            self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin
+        # Only a single ArrayBytesCodec that supports partial decoding, and no
+        # AA/BB codecs (they break the slice<->byte-range correspondence).
+        return pipeline_supports_partial_decode(
+            self.array_bytes_codec,
+            array_array_codecs=self.array_array_codecs,
+            bytes_bytes_codecs=self.bytes_bytes_codecs,
+            require_no_aa_bb=True,
         )
 
     @property
     def supports_partial_encode(self) -> bool:
-        """Determines whether the codec pipeline supports partial encoding.
-
-        Currently, only codec pipelines with a single ArrayBytesCodec that supports
-        partial encoding can support partial encoding. This limitation is due to the fact
-        that ArrayArrayCodecs can change the slice selection leading to non-contiguous
-        slices and BytesBytesCodecs can change the chunk bytes in a way that slice
-        selections cannot be attributed to byte ranges anymore which renders partial
-        encoding infeasible.
-
-        This limitation may softened in the future."""
-        return (len(self.array_array_codecs) + len(self.bytes_bytes_codecs)) == 0 and isinstance(
-            self.array_bytes_codec, ArrayBytesCodecPartialEncodeMixin
+        return pipeline_supports_partial_encode(
+            self.array_bytes_codec,
+            array_array_codecs=self.array_array_codecs,
+            bytes_bytes_codecs=self.bytes_bytes_codecs,
+            require_no_aa_bb=True,
         )
 
     def __iter__(self) -> Iterator[Codec]:
@@ -367,8 +939,8 @@ async def read_batch(
         out: NDBuffer,
         drop_axes: tuple[int, ...] = (),
     ) -> tuple[GetResult, ...]:
-        results: list[GetResult] = []
         if self.supports_partial_decode:
+            results: list[GetResult] = []
             batch_info_list = list(batch_info)
             chunk_array_batch = await self.decode_partial_batch(
                 [
@@ -387,81 +959,8 @@ async def read_batch(
                 else:
                     out[out_selection] = fill_value_or_default(chunk_spec)
                     results.append(GetResult(status="missing"))
-        else:
-            batch_info_list = list(batch_info)
-            chunk_bytes_batch = await concurrent_map(
-                [
-                    (byte_getter, array_spec.prototype)
-                    for byte_getter, array_spec, *_ in batch_info_list
-                ],
-                lambda byte_getter, prototype: byte_getter.get(prototype),
-                config.get("async.concurrency"),
-            )
-            chunk_array_batch = await self.decode_batch(
-                [
-                    (chunk_bytes, chunk_spec)
-                    for chunk_bytes, (_, chunk_spec, *_) in zip(
-                        chunk_bytes_batch, batch_info_list, strict=False
-                    )
-                ],
-            )
-            for chunk_array, (_, chunk_spec, chunk_selection, out_selection, _) in zip(
-                chunk_array_batch, batch_info_list, strict=False
-            ):
-                if chunk_array is not None:
-                    tmp = chunk_array[chunk_selection]
-                    if drop_axes:
-                        tmp = tmp.squeeze(axis=drop_axes)
-                    out[out_selection] = tmp
-                    results.append(GetResult(status="present"))
-                else:
-                    out[out_selection] = fill_value_or_default(chunk_spec)
-                    results.append(GetResult(status="missing"))
-        return tuple(results)
-
-    def _merge_chunk_array(
-        self,
-        existing_chunk_array: NDBuffer | None,
-        value: NDBuffer,
-        out_selection: SelectorTuple,
-        chunk_spec: ArraySpec,
-        chunk_selection: SelectorTuple,
-        is_complete_chunk: bool,
-        drop_axes: tuple[int, ...],
-    ) -> NDBuffer:
-        if (
-            is_complete_chunk
-            and value.shape == chunk_spec.shape
-            # Guard that this is not a partial chunk at the end with is_complete_chunk=True
-            and value[out_selection].shape == chunk_spec.shape
-        ):
-            return value
-        if existing_chunk_array is None:
-            chunk_array = chunk_spec.prototype.nd_buffer.create(
-                shape=chunk_spec.shape,
-                dtype=chunk_spec.dtype.to_native_dtype(),
-                order=chunk_spec.order,
-                fill_value=fill_value_or_default(chunk_spec),
-            )
-        else:
-            chunk_array = existing_chunk_array.copy()  # make a writable copy
-        if chunk_selection == () or is_scalar(
-            value.as_ndarray_like(), chunk_spec.dtype.to_native_dtype()
-        ):
-            chunk_value = value
-        else:
-            chunk_value = value[out_selection]
-            # handle missing singleton dimensions
-            if drop_axes:
-                item = tuple(
-                    None  # equivalent to np.newaxis
-                    if idx in drop_axes
-                    else slice(None)
-                    for idx in range(chunk_spec.ndim)
-                )
-                chunk_value = chunk_value[item]
-        chunk_array[chunk_selection] = chunk_value
-        return chunk_array
+            return tuple(results)
+        return await _async_read_fallback(self, list(batch_info), out, drop_axes)
 
     async def write_batch(
         self,
@@ -486,93 +985,8 @@ async def write_batch(
                     ],
                 )
 
-        else:
-            # Read existing bytes if not total slice
-            async def _read_key(
-                byte_setter: ByteSetter | None, prototype: BufferPrototype
-            ) -> Buffer | None:
-                if byte_setter is None:
-                    return None
-                return await byte_setter.get(prototype=prototype)
-
-            chunk_bytes_batch: Iterable[Buffer | None]
-            chunk_bytes_batch = await concurrent_map(
-                [
-                    (
-                        None if is_complete_chunk else byte_setter,
-                        chunk_spec.prototype,
-                    )
-                    for byte_setter, chunk_spec, chunk_selection, _, is_complete_chunk in batch_info
-                ],
-                _read_key,
-                config.get("async.concurrency"),
-            )
-            chunk_array_decoded = await self.decode_batch(
-                [
-                    (chunk_bytes, chunk_spec)
-                    for chunk_bytes, (_, chunk_spec, *_) in zip(
-                        chunk_bytes_batch, batch_info, strict=False
-                    )
-                ],
-            )
-
-            chunk_array_merged = [
-                self._merge_chunk_array(
-                    chunk_array,
-                    value,
-                    out_selection,
-                    chunk_spec,
-                    chunk_selection,
-                    is_complete_chunk,
-                    drop_axes,
-                )
-                for chunk_array, (
-                    _,
-                    chunk_spec,
-                    chunk_selection,
-                    out_selection,
-                    is_complete_chunk,
-                ) in zip(chunk_array_decoded, batch_info, strict=False)
-            ]
-            chunk_array_batch: list[NDBuffer | None] = []
-            for chunk_array, (_, chunk_spec, *_) in zip(
-                chunk_array_merged, batch_info, strict=False
-            ):
-                if chunk_array is None:
-                    chunk_array_batch.append(None)  # type: ignore[unreachable]
-                else:
-                    if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal(
-                        fill_value_or_default(chunk_spec)
-                    ):
-                        chunk_array_batch.append(None)
-                    else:
-                        chunk_array_batch.append(chunk_array)
-
-            chunk_bytes_batch = await self.encode_batch(
-                [
-                    (chunk_array, chunk_spec)
-                    for chunk_array, (_, chunk_spec, *_) in zip(
-                        chunk_array_batch, batch_info, strict=False
-                    )
-                ],
-            )
-
-            async def _write_key(byte_setter: ByteSetter, chunk_bytes: Buffer | None) -> None:
-                if chunk_bytes is None:
-                    await byte_setter.delete()
-                else:
-                    await byte_setter.set(chunk_bytes)
-
-            await concurrent_map(
-                [
-                    (byte_setter, chunk_bytes)
-                    for chunk_bytes, (byte_setter, *_) in zip(
-                        chunk_bytes_batch, batch_info, strict=False
-                    )
-                ],
-                _write_key,
-                config.get("async.concurrency"),
-            )
+            return
+        await _async_write_fallback(self, list(batch_info), value, drop_axes)
 
     async def decode(
         self,
@@ -632,11 +1046,13 @@ def codecs_from_list(
 ) -> tuple[tuple[ArrayArrayCodec, ...], ArrayBytesCodec, tuple[BytesBytesCodec, ...]]:
     from zarr.codecs.sharding import ShardingCodec
 
+    codecs = tuple(codecs)  # materialize to avoid generator consumption issues
+
     array_array: tuple[ArrayArrayCodec, ...] = ()
     array_bytes_maybe: ArrayBytesCodec | None = None
     bytes_bytes: tuple[BytesBytesCodec, ...] = ()
 
-    if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(tuple(codecs)) > 1:
+    if any(isinstance(codec, ShardingCodec) for codec in codecs) and len(codecs) > 1:
         warn(
             "Combining a `sharding_indexed` codec disables partial reads and "
             "writes, which may lead to inefficient performance.",
@@ -690,3 +1106,466 @@ def codecs_from_list(
 
 
 register_pipeline(BatchedCodecPipeline)
+
+
+@dataclass(frozen=True)
+class FusedCodecPipeline(CodecPipeline):
+    """Codec pipeline that runs codec compute synchronously, in bulk.
+
+    This is an opt-in alternative to `BatchedCodecPipeline`. The win is NOT
+    "separating IO from compute" — the codecs (notably `ShardingCodec`) still
+    perform their own storage IO. The win is replacing the batched pipeline's
+    per-chunk *async scheduling* (≈one coroutine per chunk, which dominates real
+    codec work) with synchronous, batched/coalesced execution:
+
+    1. When every codec implements `SupportsSyncCodec`, a `ChunkTransform`
+       runs the codec chain synchronously (no event loop, no per-chunk coroutine)
+       — optionally across a thread pool for CPU-heavy decode/encode.
+    2. Sharded reads use the codec's synchronous IO methods: byte-range reads
+       coalesced via `Store.get_ranges_sync`, and a vectorized whole-shard bulk
+       decode for dense, fixed-size, uncompressed shards. Sharded writes go
+       through the codec's synchronous full-shard-rewrite path.
+    3. When the store lacks synchronous IO (e.g. ZipStore) the pipeline falls
+       back to the async path, equivalent to `BatchedCodecPipeline`.
+
+    IO ownership: the sharding codec holds the byte getter/setter and reads/
+    writes storage directly (the same model as zarrs; unlike tensorstore, which
+    keeps codecs storage-free). A storage-free codec is a possible future
+    direction (see the pure-codec design notes) but is explicitly NOT what this
+    pipeline does.
+    """
+
+    codecs: tuple[Codec, ...]
+    array_array_codecs: tuple[ArrayArrayCodec, ...]
+    array_bytes_codec: ArrayBytesCodec
+    bytes_bytes_codecs: tuple[BytesBytesCodec, ...]
+    sync_transform: ChunkTransform | None
+    batch_size: int
+
+    @classmethod
+    def from_codecs(cls, codecs: Iterable[Codec], *, batch_size: int | None = None) -> Self:
+        codec_list = tuple(codecs)
+        aa, ab, bb = codecs_from_list(codec_list)
+
+        if batch_size is None:
+            batch_size = config.get("codec_pipeline.batch_size")
+
+        return cls(
+            codecs=codec_list,
+            array_array_codecs=aa,
+            array_bytes_codec=ab,
+            bytes_bytes_codecs=bb,
+            sync_transform=None,
+            batch_size=batch_size,
+        )
+
+    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
+        evolved_codecs = evolve_codecs(self.codecs, array_spec)
+        aa, ab, bb = codecs_from_list(evolved_codecs)
+
+        try:
+            sync_transform: ChunkTransform | None = ChunkTransform(codecs=evolved_codecs)
+        except TypeError:
+            sync_transform = None
+
+        return type(self)(
+            codecs=evolved_codecs,
+            array_array_codecs=aa,
+            array_bytes_codec=ab,
+            bytes_bytes_codecs=bb,
+            sync_transform=sync_transform,
+            batch_size=self.batch_size,
+        )
+
+    def __iter__(self) -> Iterator[Codec]:
+        return iter(self.codecs)
+
+    @property
+    def supports_partial_decode(self) -> bool:
+        # NOTE: unlike BatchedCodecPipeline this does NOT require the AA/BB codec
+        # lists to be empty (require_no_aa_bb=False). That divergence is tracked
+        # separately; see pipeline_supports_partial_decode.
+        return pipeline_supports_partial_decode(
+            self.array_bytes_codec,
+            array_array_codecs=self.array_array_codecs,
+            bytes_bytes_codecs=self.bytes_bytes_codecs,
+            require_no_aa_bb=False,
+        )
+
+    @property
+    def supports_partial_encode(self) -> bool:
+        return pipeline_supports_partial_encode(
+            self.array_bytes_codec,
+            array_array_codecs=self.array_array_codecs,
+            bytes_bytes_codecs=self.bytes_bytes_codecs,
+            require_no_aa_bb=False,
+        )
+
+    def validate(
+        self,
+        *,
+        shape: tuple[int, ...],
+        dtype: ZDType[TBaseDType, TBaseScalar],
+        chunk_grid: ChunkGridMetadata,
+    ) -> None:
+        for codec in self.codecs:
+            codec.validate(shape=shape, dtype=dtype, chunk_grid=chunk_grid)
+
+    def compute_encoded_size(self, byte_length: int, array_spec: ArraySpec) -> int:
+        for codec in self:
+            byte_length = codec.compute_encoded_size(byte_length, array_spec)
+            array_spec = codec.resolve_metadata(array_spec)
+        return byte_length
+
+    # -- async decode/encode (required by ABC) and sync versions --
+
+    async def decode(
+        self,
+        chunk_bytes_and_specs: Iterable[tuple[Buffer | None, ArraySpec]],
+    ) -> Iterable[NDBuffer | None]:
+        # Decode each chunk through AsyncChunkTransform, which threads the
+        # per-stage spec correctly (via resolve_aa_specs). This is the single
+        # source of truth for async per-chunk decode; earlier this method
+        # reused one flat `chunk_specs` across every codec stage, which silently
+        # corrupted/crashed spec-changing codecs (transpose/cast/scale_offset)
+        # on the async fallback path.
+        async_transform = AsyncChunkTransform(codecs=self.codecs)
+        out: list[NDBuffer | None] = []
+        for chunk_bytes, chunk_spec in chunk_bytes_and_specs:
+            if chunk_bytes is None:
+                out.append(None)
+            else:
+                out.append(await async_transform.decode_chunk(chunk_bytes, chunk_spec))
+        return out
+
+    def decode_sync(
+        self,
+        chunk_bytes_and_specs: Sequence[tuple[Buffer | None, ArraySpec]],
+    ) -> Iterable[NDBuffer | None]:
+        """A sync version of `decode`.
+
+        Parameters
+        ----------
+        chunk_bytes_and_specs
+            The chunks and their descriptions (i.e., shape, fill value etc.)
+
+        Returns
+        -------
+            An iterator over decoded chunks
+
+        Raises
+        ------
+        RuntimeError
+            If there is no sync transform
+        """
+        transform = self.sync_transform
+        if transform is None:
+            raise RuntimeError("Do not call this method without a sync transform")
+        max_workers = _resolve_max_workers()
+
+        def _decode(item: tuple[Buffer | None, ArraySpec]) -> NDBuffer | None:
+            return None if item[0] is None else transform.decode_chunk(item[0], item[1])
+
+        if max_workers > 1 and len(chunk_bytes_and_specs) > 1:
+            pool = _get_pool(max_workers)
+            return list(
+                pool.map(
+                    _decode,
+                    chunk_bytes_and_specs,
+                )
+            )
+        else:
+            return [_decode(item) for item in chunk_bytes_and_specs]
+
+    async def encode(
+        self,
+        chunk_arrays_and_specs: Iterable[tuple[NDBuffer | None, ArraySpec]],
+    ) -> Iterable[Buffer | None]:
+        async_transform = AsyncChunkTransform(codecs=self.codecs)
+        out: list[Buffer | None] = []
+        for chunk_array, chunk_spec in chunk_arrays_and_specs:
+            if chunk_array is None:
+                out.append(None)
+            else:
+                out.append(await async_transform.encode_chunk(chunk_array, chunk_spec))
+        return out
+
+    def encode_sync(
+        self,
+        chunk_arrays_and_specs: Sequence[tuple[NDBuffer | None, ArraySpec]],
+    ) -> Iterable[Buffer | None]:
+        """A sync version of `encode`.
+
+        Parameters
+        ----------
+        chunk_arrays_and_specs
+            The chunks and their descriptions (i.e., shape, fill value etc.)
+
+        Returns
+        -------
+            An iterator over encoded chunks
+
+        Raises
+        ------
+        RuntimeError
+            If there is no sync transform
+        """
+        transform = self.sync_transform
+        if transform is None:
+            raise RuntimeError("Do not call this method without a sync transform")
+
+        max_workers = _resolve_max_workers()
+
+        def _encode(item: tuple[NDBuffer | None, ArraySpec]) -> Buffer | None:
+            return None if item[0] is None else transform.encode_chunk(item[0], item[1])
+
+        if max_workers > 1 and len(chunk_arrays_and_specs) > 1:
+            pool = _get_pool(max_workers)
+            return list(
+                pool.map(
+                    _encode,
+                    chunk_arrays_and_specs,
+                )
+            )
+        else:
+            return [_encode(item) for item in chunk_arrays_and_specs]
+
+    # -- sync read/write --
+
+    def read_sync(
+        self,
+        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+        out: NDBuffer,
+        drop_axes: tuple[int, ...] = (),
+        max_workers: int = 1,
+    ) -> tuple[GetResult, ...]:
+        """Synchronous read: fetch -> decode -> scatter, per chunk.
+
+        When `max_workers > 1` and there are multiple chunks, each
+        chunk's full lifecycle (fetch + decode + scatter) runs as one
+        task on a thread pool sized to `max_workers` — overlapping IO
+        of one chunk with decode/scatter of another. Scatter is
+        thread-safe because the chunks have non-overlapping output
+        selections.
+
+        `max_workers=1` runs everything sequentially in the calling
+        thread (no pool involvement).
+
+        Mirrors `BatchedCodecPipeline.read_batch`: when the AB codec
+        supports partial decoding (e.g. sharding), the codec handles its
+        own IO and only fetches the inner-chunk byte ranges that overlap
+        the read selection. Otherwise the pipeline fetches the full
+        blob and decodes the whole chunk.
+        """
+        assert self.sync_transform is not None
+        transform = self.sync_transform
+
+        batch = list(batch_info)
+        if not batch:
+            return ()
+
+        # Partial-decode fast path: the AB codec owns IO (read only the
+        # byte ranges needed for the requested selection). Same condition
+        # and dispatch as BatchedCodecPipeline.read_batch.
+        if self.supports_partial_decode:
+            codec = self.array_bytes_codec
+            assert hasattr(codec, "_decode_partial_sync")
+
+            def _read_one(
+                item: tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool],
+            ) -> GetResult:
+                byte_getter, chunk_spec, chunk_selection, out_selection, _ = item
+                # the partial decode returns the already-selected region
+                decoded = codec._decode_partial_sync(byte_getter, chunk_selection, chunk_spec)
+                return scatter_chunk(
+                    decoded,
+                    out,
+                    chunk_spec=chunk_spec,
+                    out_selection=out_selection,
+                    drop_axes=drop_axes,
+                )
+
+        else:
+            # Per-chunk fused path: fetch + decode + scatter as one task.
+            def _read_one(
+                item: tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool],
+            ) -> GetResult:
+                byte_getter, chunk_spec, chunk_selection, out_selection, _ = item
+                raw = byte_getter.get_sync(prototype=chunk_spec.prototype)
+                return decode_and_scatter_chunk(
+                    raw,
+                    out,
+                    chunk_spec=chunk_spec,
+                    chunk_selection=chunk_selection,
+                    out_selection=out_selection,
+                    drop_axes=drop_axes,
+                    decode=transform.decode_chunk,
+                )
+
+        if max_workers > 1 and len(batch) > 1:
+            pool = _get_pool(max_workers)
+            return tuple(pool.map(_read_one, batch))
+        return tuple(_read_one(item) for item in batch)
+
+    def write_sync(
+        self,
+        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+        value: NDBuffer,
+        drop_axes: tuple[int, ...] = (),
+        max_workers: int = 1,
+    ) -> None:
+        """Synchronous write: fetch existing -> merge+encode -> store.
+
+        When `max_workers > 1` and there are multiple chunks, each
+        chunk's full lifecycle (get-existing + merge + encode + set/delete)
+        runs as one task on a thread pool sized to `max_workers` —
+        overlapping IO of one chunk with compute of another.
+
+        `max_workers=1` runs everything sequentially in the calling
+        thread (no pool involvement).
+
+        When the codec pipeline supports partial encoding (e.g. a
+        sharding codec with no outer AA/BB codecs), the AB codec handles
+        the full write cycle — reading existing data, merging, encoding,
+        and writing — matching the async `BatchedCodecPipeline` path.
+        """
+        assert self.sync_transform is not None
+        transform = self.sync_transform
+
+        batch = list(batch_info)
+        if not batch:
+            return
+
+        # Partial-encode path: the AB codec owns IO (read, merge, encode,
+        # write).  Same condition and calling convention as
+        # BatchedCodecPipeline.write_batch.
+        if self.supports_partial_encode:
+            codec = self.array_bytes_codec
+            assert hasattr(codec, "_encode_partial_sync")
+            scalar = len(value.shape) == 0
+
+            def _write_one(
+                item: tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool],
+            ) -> None:
+                bs, chunk_spec, chunk_selection, out_selection, _is_complete = item
+                chunk_value = value if scalar else value[out_selection]
+                codec._encode_partial_sync(bs, chunk_value, chunk_selection, chunk_spec)
+
+        else:
+            # Per-chunk fused path: get-existing + merge + encode + set/delete as one task.
+            def _write_one(
+                item: tuple[Any, ArraySpec, SelectorTuple, SelectorTuple, bool],
+            ) -> None:
+                bs, chunk_spec, chunk_selection, out_selection, is_complete = item
+                existing_bytes: Buffer | None = None
+                if not is_complete:
+                    existing_bytes = bs.get_sync(prototype=chunk_spec.prototype)
+
+                encoded = merge_and_encode_chunk(
+                    existing_bytes,
+                    value,
+                    chunk_spec=chunk_spec,
+                    chunk_selection=chunk_selection,
+                    out_selection=out_selection,
+                    is_complete=is_complete,
+                    drop_axes=drop_axes,
+                    decode=transform.decode_chunk,
+                    encode=transform.encode_chunk,
+                )
+                if encoded is None:
+                    bs.delete_sync()
+                else:
+                    bs.set_sync(encoded)
+
+        if max_workers > 1 and len(batch) > 1:
+            pool = _get_pool(max_workers)
+            list(pool.map(_write_one, batch))
+        else:
+            for item in batch:
+                _write_one(item)
+
+    # -- async read/write --
+
+    async def read(
+        self,
+        batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+        out: NDBuffer,
+        drop_axes: tuple[int, ...] = (),
+    ) -> tuple[GetResult, ...]:
+        batch = list(batch_info)
+        if not batch:
+            return ()
+
+        # Fast path: sync transform plus synchronous IO. For StorePath the gate
+        # is on the STORE's sync support (StorePath always has a get_sync
+        # method, but it only works when its store does); for other byte
+        # getters (e.g. the sharding codec's in-memory _ShardingByteGetter) the
+        # SyncByteGetter protocol is the gate.
+        from zarr.abc.store import SupportsGetSync, SyncByteGetter
+        from zarr.storage._common import StorePath
+
+        first_bg = batch[0][0]
+        if self.sync_transform is not None and (
+            (isinstance(first_bg, StorePath) and isinstance(first_bg.store, SupportsGetSync))
+            or (not isinstance(first_bg, StorePath) and isinstance(first_bg, SyncByteGetter))
+        ):
+            return self.read_sync(batch, out, drop_axes, max_workers=_resolve_max_workers())
+
+        # Non-sync store (e.g. ZipStore): can't use the sync fast path. But if the
+        # array-bytes codec supports partial decoding (sharding), still route
+        # through the async partial-decode path — it fetches only the needed
+        # inner-chunk byte ranges (coalesced via get_ranges), matching
+        # BatchedCodecPipeline. Without this, the whole-shard _async_read_fallback
+        # below would over-read and diverge from the batched pipeline's IO.
+        if self.supports_partial_decode:
+            assert isinstance(self.array_bytes_codec, ArrayBytesCodecPartialDecodeMixin)
+            chunk_array_batch = await self.array_bytes_codec.decode_partial(
+                [
+                    (byte_getter, chunk_selection, chunk_spec)
+                    for byte_getter, chunk_spec, chunk_selection, *_ in batch
+                ]
+            )
+            results: list[GetResult] = []
+            for chunk_array, (_, chunk_spec, _, out_selection, _) in zip(
+                chunk_array_batch, batch, strict=False
+            ):
+                if chunk_array is not None:
+                    if drop_axes:
+                        chunk_array = chunk_array.squeeze(axis=drop_axes)
+                    out[out_selection] = chunk_array
+                    results.append(GetResult(status="present"))
+                else:
+                    out[out_selection] = fill_value_or_default(chunk_spec)
+                    results.append(GetResult(status="missing"))
+            return tuple(results)
+
+        return await _async_read_fallback(self, batch, out, drop_axes)
+
+    async def write(
+        self,
+        batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
+        value: NDBuffer,
+        drop_axes: tuple[int, ...] = (),
+    ) -> None:
+        batch = list(batch_info)
+        if not batch:
+            return
+
+        # Fast path: sync transform plus synchronous IO. Mirrors `read`: gate
+        # StorePath on the store's sync support, other byte setters (e.g. the
+        # sharding codec's in-memory _ShardingByteSetter) on SyncByteSetter.
+        from zarr.abc.store import SupportsSetSync, SyncByteSetter
+        from zarr.storage._common import StorePath
+
+        first_bs = batch[0][0]
+        if self.sync_transform is not None and (
+            (isinstance(first_bs, StorePath) and isinstance(first_bs.store, SupportsSetSync))
+            or (not isinstance(first_bs, StorePath) and isinstance(first_bs, SyncByteSetter))
+        ):
+            self.write_sync(batch, value, drop_axes, max_workers=_resolve_max_workers())
+            return
+
+        await _async_write_fallback(self, batch, value, drop_axes)
+
+
+register_pipeline(FusedCodecPipeline)
diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py
index 7dcbc78e31..746f22ff66 100644
--- a/src/zarr/core/config.py
+++ b/src/zarr/core/config.py
@@ -104,8 +104,12 @@ def enable_gpu(self) -> ConfigSet:
             "threading": {"max_workers": None},
             "json_indent": 2,
             "codec_pipeline": {
-                "path": "zarr.core.codec_pipeline.BatchedCodecPipeline",
+                "path": "zarr.core.codec_pipeline.FusedCodecPipeline",
                 "batch_size": 1,
+                # Default to sequential (no thread pool). Threading-by-default is a
+                # separate, larger change (downstream thread-safety, oversubscription
+                # on many-core nodes); opt in with codec_pipeline.max_workers > 1.
+                "max_workers": 1,
             },
             "codecs": {
                 "blosc": "zarr.codecs.blosc.BloscCodec",
diff --git a/src/zarr/storage/_fsspec.py b/src/zarr/storage/_fsspec.py
index 29201a6fee..90e809cd60 100644
--- a/src/zarr/storage/_fsspec.py
+++ b/src/zarr/storage/_fsspec.py
@@ -42,13 +42,13 @@ async def _close_fs(fs: AsyncFileSystem) -> None:
     """
     Best-effort async close of an fsspec async filesystem owned by FsspecStore.
 
-    For filesystems that expose ``set_session()`` (e.g. s3fs) the underlying
-    aiohttp ``ClientSession`` is closed explicitly, which prevents
-    "Unclosed client session" ``ResourceWarning``s from aiohttp.  For all
+    For filesystems that expose `set_session()` (e.g. s3fs) the underlying
+    aiohttp `ClientSession` is closed explicitly, which prevents
+    "Unclosed client session" `ResourceWarning`s from aiohttp.  For all
     other filesystem types the call is a no-op (not every implementation
     manages an HTTP session directly).
 
-    Note that ``set_session()`` lazily creates a session if none exists yet, so
+    Note that `set_session()` lazily creates a session if none exists yet, so
     closing a store that never performed any I/O may instantiate a session
     purely to close it.  This is accepted best-effort behavior; fsspec does not
     expose a stable, cross-implementation way to test for an existing session.
@@ -286,7 +286,7 @@ def with_read_only(self, read_only: bool = False) -> FsspecStore:
         )
         # The derived store shares the same fs. Transfer ownership so the
         # surviving store closes it, and clear ours to avoid a double-close.
-        # Otherwise the common ``from_url(...).with_read_only()`` chain would
+        # Otherwise the common `from_url(...).with_read_only()` chain would
         # drop the only owner (the unreferenced source) and leak the session.
         new_store._owns_fs = self._owns_fs
         self._owns_fs = False
diff --git a/src/zarr/testing/buffer.py b/src/zarr/testing/buffer.py
index 6096ece2f8..f666801694 100644
--- a/src/zarr/testing/buffer.py
+++ b/src/zarr/testing/buffer.py
@@ -13,6 +13,8 @@
     from collections.abc import Iterable
     from typing import Self
 
+    from zarr.abc.store import ByteRequest
+
 
 __all__ = [
     "NDBufferUsingTestNDArrayLike",
@@ -72,6 +74,13 @@ async def set(self, key: str, value: Buffer, byte_range: tuple[int, int] | None
             assert isinstance(value, TestBuffer)
         await super().set(key, value, byte_range)
 
+    def set_sync(self, key: str, value: Buffer) -> None:
+        # Synchronous counterpart of `set`, used by FusedCodecPipeline. Mirror the
+        # same buffer-type guard so the invariant holds whichever pipeline writes.
+        if "json" not in key:
+            assert isinstance(value, TestBuffer)
+        super().set_sync(key, value)
+
     async def get(
         self,
         key: str,
@@ -84,3 +93,18 @@ async def get(
         if ret is not None:
             assert isinstance(ret, prototype.buffer)
         return ret
+
+    def get_sync(
+        self,
+        key: str,
+        *,
+        prototype: BufferPrototype | None = None,
+        byte_range: ByteRequest | None = None,
+    ) -> Buffer | None:
+        # Synchronous counterpart of `get`, used by FusedCodecPipeline.
+        if "json" not in key and prototype is not None:
+            assert prototype.buffer is TestBuffer
+        ret = super().get_sync(key=key, prototype=prototype, byte_range=byte_range)
+        if ret is not None and prototype is not None:
+            assert isinstance(ret, prototype.buffer)
+        return ret
diff --git a/tests/benchmarks/test_e2e.py b/tests/benchmarks/test_e2e.py
index 65d0e65ac9..0e5e88b72c 100644
--- a/tests/benchmarks/test_e2e.py
+++ b/tests/benchmarks/test_e2e.py
@@ -9,16 +9,21 @@
 from tests.benchmarks.common import Layout
 
 if TYPE_CHECKING:
+    from collections.abc import Iterator
+
     from pytest_benchmark.fixture import BenchmarkFixture
 
     from zarr.abc.store import Store
     from zarr.core.common import NamedConfig
+
 from operator import getitem, setitem
 from typing import Any, Literal
 
 import pytest
 
 from zarr import create_array
+from zarr.core.config import config as zarr_config
+from zarr.testing.store import LatencyStore
 
 CompressorName = Literal["gzip"] | None
 
@@ -37,18 +42,63 @@
     Layout(shape=(1_000_000,), chunks=(100,), shards=(10000 * 100,)),
 )
 
+_PIPELINE_PATHS = {
+    "batched": "zarr.core.codec_pipeline.BatchedCodecPipeline",
+    "fused": "zarr.core.codec_pipeline.FusedCodecPipeline",
+}
+
+_LATENCY_VALUES = (0.0, 0.001, 0.05, 0.2)
+
+
+@pytest.fixture(params=_LATENCY_VALUES, ids=lambda v: f"latency={v}")
+def latency(request: pytest.FixtureRequest) -> float:
+    return request.param  # type: ignore[no-any-return]
+
+
+@pytest.fixture
+def bench_store(store: Store, latency: float, request: pytest.FixtureRequest) -> Store:
+    """Wraps the underlying store in LatencyStore when latency > 0.
+
+    Local-store cases skip nonzero latency — synthetic latency on top of
+    a real LocalStore is double-counting; latency simulation only applies
+    to the in-process memory store.
+    """
+    callspec = getattr(request.node, "callspec", None)
+    store_kind = callspec.params.get("store", "memory") if callspec is not None else "memory"
+    if latency > 0:
+        if store_kind == "local":
+            pytest.skip("latency injection only applies to in-memory store")
+        return LatencyStore(store, get_latency=latency, set_latency=latency)
+    return store
+
+
+@pytest.fixture(params=["batched", "fused"])
+def pipeline(request: pytest.FixtureRequest) -> Iterator[str]:
+    """Set ``codec_pipeline.path`` for the duration of the benchmark.
+
+    Yields the pipeline name so each parametrize cell has a distinct
+    benchmark id.
+    """
+    name = request.param
+    with zarr_config.set({"codec_pipeline.path": _PIPELINE_PATHS[name]}):
+        yield name
+
 
 @pytest.mark.parametrize("compression_name", [None, "gzip"])
 @pytest.mark.parametrize("layout", layouts, ids=str)
 @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"])
 def test_write_array(
-    store: Store, layout: Layout, compression_name: CompressorName, benchmark: BenchmarkFixture
+    bench_store: Store,
+    layout: Layout,
+    compression_name: CompressorName,
+    pipeline: str,
+    benchmark: BenchmarkFixture,
 ) -> None:
     """
     Test the time required to fill an array with a single value
     """
     arr = create_array(
-        store,
+        bench_store,
         dtype="uint8",
         shape=layout.shape,
         chunks=layout.chunks,
@@ -64,13 +114,17 @@ def test_write_array(
 @pytest.mark.parametrize("layout", layouts, ids=str)
 @pytest.mark.parametrize("store", ["memory", "local"], indirect=["store"])
 def test_read_array(
-    store: Store, layout: Layout, compression_name: CompressorName, benchmark: BenchmarkFixture
+    bench_store: Store,
+    layout: Layout,
+    compression_name: CompressorName,
+    pipeline: str,
+    benchmark: BenchmarkFixture,
 ) -> None:
     """
     Test the time required to fill an array with a single value
     """
     arr = create_array(
-        store,
+        bench_store,
         dtype="uint8",
         shape=layout.shape,
         chunks=layout.chunks,
diff --git a/tests/test_sync_codec_pipeline.py b/tests/test_chunk_transform.py
similarity index 83%
rename from tests/test_sync_codec_pipeline.py
rename to tests/test_chunk_transform.py
index 1bfde7c837..7be9d1ce9c 100644
--- a/tests/test_sync_codec_pipeline.py
+++ b/tests/test_chunk_transform.py
@@ -1,3 +1,13 @@
+"""Unit tests for ChunkTransform -- the per-chunk synchronous codec chain.
+
+ChunkTransform is the data structure FusedCodecPipeline uses to encode/decode a
+single chunk through a sequence of codecs synchronously. These tests exercise it
+directly (no pipeline, no store): construction and its rejection of codecs that
+lack a synchronous implementation, encode/decode roundtrips across codec chains,
+compute_encoded_size, and None short-circuiting when an array->array codec
+returns None. End-to-end pipeline behavior lives in the pipeline test modules.
+"""
+
 from __future__ import annotations
 
 from typing import Any
@@ -58,8 +68,8 @@ def _make_nd_buffer(arr: np.ndarray[Any, np.dtype[Any]]) -> NDBuffer:
 )
 def test_construction(shape: tuple[int, ...], codecs: tuple[Codec, ...]) -> None:
     """Construction succeeds when all codecs implement SupportsSyncCodec."""
-    spec = _make_array_spec(shape, np.dtype("float64"))
-    ChunkTransform(codecs=codecs, array_spec=spec)
+    _ = _make_array_spec(shape, np.dtype("float64"))
+    ChunkTransform(codecs=codecs)
 
 
 @pytest.mark.parametrize(
@@ -72,9 +82,9 @@ def test_construction(shape: tuple[int, ...], codecs: tuple[Codec, ...]) -> None
 )
 def test_construction_rejects_non_sync(shape: tuple[int, ...], codecs: tuple[Codec, ...]) -> None:
     """Construction raises TypeError when any codec lacks SupportsSyncCodec."""
-    spec = _make_array_spec(shape, np.dtype("float64"))
+    _ = _make_array_spec(shape, np.dtype("float64"))
     with pytest.raises(TypeError, match="AsyncOnlyCodec"):
-        ChunkTransform(codecs=codecs, array_spec=spec)
+        ChunkTransform(codecs=codecs)
 
 
 @pytest.mark.parametrize(
@@ -96,12 +106,12 @@ def test_encode_decode_roundtrip(
 ) -> None:
     """Data survives a full encode/decode cycle."""
     spec = _make_array_spec(arr.shape, arr.dtype)
-    chain = ChunkTransform(codecs=codecs, array_spec=spec)
+    chain = ChunkTransform(codecs=codecs)
     nd_buf = _make_nd_buffer(arr)
 
-    encoded = chain.encode(nd_buf)
+    encoded = chain.encode_chunk(nd_buf, spec)
     assert encoded is not None
-    decoded = chain.decode(encoded)
+    decoded = chain.decode_chunk(encoded, spec)
     np.testing.assert_array_equal(arr, decoded.as_numpy_array())
 
 
@@ -122,7 +132,7 @@ def test_compute_encoded_size(
 ) -> None:
     """compute_encoded_size returns the correct byte length."""
     spec = _make_array_spec(shape, np.dtype("float64"))
-    chain = ChunkTransform(codecs=codecs, array_spec=spec)
+    chain = ChunkTransform(codecs=codecs)
     assert chain.compute_encoded_size(input_size, spec) == expected_size
 
 
@@ -138,8 +148,7 @@ def _encode_sync(self, chunk_array: NDBuffer, chunk_spec: ArraySpec) -> NDBuffer
     spec = _make_array_spec((3, 4), np.dtype("float64"))
     chain = ChunkTransform(
         codecs=(NoneReturningAACodec(order=(1, 0)), BytesCodec()),
-        array_spec=spec,
     )
     arr = np.arange(12, dtype="float64").reshape(3, 4)
     nd_buf = _make_nd_buffer(arr)
-    assert chain.encode(nd_buf) is None
+    assert chain.encode_chunk(nd_buf, spec) is None
diff --git a/tests/test_codec_pipeline.py b/tests/test_codec_pipeline.py
index fa41c2867b..656969b3a3 100644
--- a/tests/test_codec_pipeline.py
+++ b/tests/test_codec_pipeline.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING, Any
+
 import numpy as np
 import pytest
 
@@ -7,29 +9,55 @@
 from zarr.codecs import BytesCodec, CastValue
 from zarr.core.array import _get_chunk_spec
 from zarr.core.buffer.core import default_buffer_prototype
+from zarr.core.config import config as zarr_config
 from zarr.core.indexing import BasicIndexer
 from zarr.storage import MemoryStore
 
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+@pytest.fixture(autouse=True)
+def _enable_rectilinear_chunks() -> Generator[None]:
+    """Enable rectilinear chunks for all tests in this module."""
+    with zarr_config.set({"array.rectilinear_chunks": True}):
+        yield
+
+
+pipeline_paths = [
+    "zarr.core.codec_pipeline.BatchedCodecPipeline",
+    "zarr.core.codec_pipeline.FusedCodecPipeline",
+]
+
+
+@pytest.fixture(params=pipeline_paths, ids=["batched", "sync"])
+def pipeline_class(request: pytest.FixtureRequest) -> Generator[str]:
+    """Temporarily set the codec pipeline class for the test."""
+    path = request.param
+    with zarr_config.set({"codec_pipeline.path": path}):
+        yield path
+
+
+# ---------------------------------------------------------------------------
+# GetResult status tests (low-level pipeline API)
+# ---------------------------------------------------------------------------
+
 
 @pytest.mark.parametrize(
     ("write_slice", "read_slice", "expected_statuses"),
     [
-        # Write all chunks, read all — all present
         (slice(None), slice(None), ("present", "present", "present")),
-        # Write first chunk only, read all — first present, rest missing
         (slice(0, 2), slice(None), ("present", "missing", "missing")),
-        # Write nothing, read all — all missing
         (None, slice(None), ("missing", "missing", "missing")),
     ],
 )
 async def test_read_returns_get_results(
+    pipeline_class: str,
     write_slice: slice | None,
     read_slice: slice,
     expected_statuses: tuple[str, ...],
 ) -> None:
-    """
-    Test that CodecPipeline.read returns a tuple of GetResult with correct statuses.
-    """
+    """CodecPipeline.read returns GetResult with correct statuses."""
     store = MemoryStore()
     arr = zarr.open_array(store, mode="w", shape=(6,), chunks=(2,), dtype="int64", fill_value=-1)
 
@@ -74,6 +102,34 @@ async def test_read_returns_get_results(
         assert result["status"] == expected_status
 
 
+# ---------------------------------------------------------------------------
+# write_empty_chunks / read_missing_chunks config tests
+# ---------------------------------------------------------------------------
+
+
+async def test_write_empty_chunks_false_no_store(pipeline_class: str) -> None:
+    """With write_empty_chunks=False, fill_value-only chunks should not be stored."""
+    store: dict[str, Any] = {}
+    arr = zarr.create_array(
+        store=store,
+        shape=(20,),
+        dtype="float64",
+        chunks=(10,),
+        shards=None,
+        compressors=None,
+        fill_value=0.0,
+        config={"write_empty_chunks": False},
+    )
+    arr[:] = 0.0  # all fill_value
+
+    # Chunks should NOT be persisted
+    assert "c/0" not in store
+    assert "c/1" not in store
+
+    # But reading should still return fill values
+    np.testing.assert_array_equal(arr[:], np.zeros(20, dtype="float64"))
+
+
 try:
     import cast_value_rs  # noqa: F401
 
@@ -120,3 +176,78 @@ def test_codec_pipeline_threads_dtype_through_evolve(source_dtype: str, target_d
     )
     arr[:] = np.asarray([0, 1, 2, 3], dtype=source_dtype)
     np.testing.assert_array_equal(arr[:], np.asarray([0, 1, 2, 3], dtype=source_dtype))
+
+
+def test_evolve_threads_spec_preserving_serializer_endian(pipeline_class: str) -> None:
+    """Regression for #3937, dependency-free variant.
+
+    `evolve_from_array_spec` must thread the spec FORWARD through the codec chain:
+    each codec is evolved against the spec produced by the previous one, not the
+    original array spec. An array->array codec that widens the dtype from a
+    single-byte type (no endianness) to a multi-byte type means the BytesCodec
+    serializer must be evolved against the *widened* dtype — otherwise it sees
+    the single-byte source, strips its `endian` to None, and later fails to
+    decode the multi-byte data.
+
+    The original regression test for this needs `cast_value_rs` (so it only runs
+    in the optional-deps CI job). This variant uses a minimal dtype-widening AA
+    codec stub, so it runs everywhere and on both pipelines via `pipeline_class`.
+    """
+    from dataclasses import dataclass
+
+    from zarr.abc.codec import ArrayArrayCodec
+    from zarr.core.array_spec import ArrayConfig, ArraySpec
+    from zarr.core.dtype import get_data_type_from_native_dtype
+    from zarr.registry import get_pipeline_class
+
+    @dataclass(frozen=True)
+    class _WidenToInt16(ArrayArrayCodec):
+        """Test-only AA codec: reports the encoded dtype as int16 (no real encode)."""
+
+        is_fixed_size = True
+
+        def to_dict(self) -> dict[str, Any]:
+            return {"name": "_widen_to_int16"}
+
+        @classmethod
+        def from_dict(cls, data: dict[str, Any]) -> _WidenToInt16:
+            return cls()
+
+        def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
+            from dataclasses import replace
+
+            return replace(chunk_spec, dtype=get_data_type_from_native_dtype(np.dtype("int16")))
+
+        def compute_encoded_size(self, input_byte_length: int, _spec: ArraySpec) -> int:
+            return input_byte_length
+
+        async def _decode_single(self, chunk_array: Any, chunk_spec: ArraySpec) -> Any:
+            return chunk_array  # pragma: no cover
+
+        async def _encode_single(self, chunk_array: Any, chunk_spec: ArraySpec) -> Any:
+            return chunk_array  # pragma: no cover
+
+    zdtype = get_data_type_from_native_dtype(np.dtype("int8"))  # single-byte source
+    spec = ArraySpec(
+        shape=(4,),
+        dtype=zdtype,
+        fill_value=zdtype.cast_scalar(0),
+        config=ArrayConfig(order="C", write_empty_chunks=False),
+        prototype=default_buffer_prototype(),
+    )
+
+    from zarr.core.codec_pipeline import BatchedCodecPipeline, FusedCodecPipeline
+
+    pipeline = get_pipeline_class().from_codecs((_WidenToInt16(), BytesCodec(endian="little")))
+    evolved = pipeline.evolve_from_array_spec(spec)
+    # Both concrete pipelines expose `array_bytes_codec`; narrow off the ABC.
+    assert isinstance(evolved, (BatchedCodecPipeline, FusedCodecPipeline))
+    serializer = evolved.array_bytes_codec
+
+    # The serializer must keep its little-endian setting: it is evolved against
+    # the widened (int16) dtype, not the single-byte source.
+    assert isinstance(serializer, BytesCodec)
+    assert serializer.endian is not None, (
+        "BytesCodec serializer lost its `endian` — evolve_from_array_spec did not "
+        "thread the dtype-widening AA codec's spec into the serializer"
+    )
diff --git a/tests/test_codec_pipeline_suite.py b/tests/test_codec_pipeline_suite.py
new file mode 100644
index 0000000000..07e1aa2ec4
--- /dev/null
+++ b/tests/test_codec_pipeline_suite.py
@@ -0,0 +1,582 @@
+"""Shared codec-pipeline behavior suite, run against EVERY codec pipeline.
+
+The defining property of a codec pipeline is that the array semantics it
+produces must be identical no matter which pipeline is configured. To make
+"one pipeline diverges from the others" structurally hard to ship, every
+pipeline-agnostic behavior test lives as a method on ``CodecPipelineTests`` and
+is instantiated once per pipeline (``TestBatchedPipeline`` / ``TestFusedPipeline``).
+
+Each test also runs over a *store axis* that exercises both code paths the
+synchronous pipelines branch on:
+
+* ``sync``  -> ``MemoryStore`` (supports ``get_sync``/``set_sync``: fast path)
+* ``async`` -> ``LatencyStore(MemoryStore())`` (NOT sync-capable: async fallback)
+
+The async axis is deliberate: a regression that only affects the async fallback
+of the default pipeline (e.g. a codec-spec-evolution bug that surfaces only on
+remote stores) is invisible if every test runs on MemoryStore. Running the same
+battery over a non-sync store closes that gap.
+
+Pipeline-specific tests (construction, ``from_codecs``, the byte-range write
+fast path, etc.) stay in their own modules; only behavior that ALL pipelines
+must share belongs here.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any
+
+import numcodecs
+import numpy as np
+import pytest
+
+import zarr
+from zarr.codecs import BytesCodec, GzipCodec, ShardingCodec, TransposeCodec
+from zarr.core.config import config as zarr_config
+from zarr.errors import ChunkNotFoundError
+from zarr.storage import MemoryStore
+from zarr.testing.store import LatencyStore
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator
+
+    from zarr.abc.store import Store
+    from zarr.codecs.sharding import SubchunkWriteOrder
+
+
+# --- store axis: a sync store and a non-sync (async-fallback) store ----------
+
+STORE_KINDS = ["sync", "async"]
+
+
+def _make_store(kind: str) -> Store:
+    if kind == "sync":
+        # MemoryStore supports get_sync/set_sync -> synchronous fast path.
+        return MemoryStore()
+    if kind == "async":
+        # LatencyStore is NOT SupportsGetSync/SupportsSetSync, so a synchronous
+        # pipeline must fall back to its async path. Zero latency keeps it fast.
+        return LatencyStore(MemoryStore(), get_latency=0.0, set_latency=0.0)
+    raise AssertionError(kind)
+
+
+# --- scenario model ----------------------------------------------------------
+#
+# Most pipeline behavior tests have one shape:
+#   create an array, apply some writes, (optionally) assert which chunk keys
+#   exist, then assert reads come back correct. A Scenario captures exactly those
+#   variables so one parametrized test covers them all. Correctness is checked
+#   against a numpy reference array that the scenario mutates in lock-step with
+#   the zarr array, so cases don't hand-maintain expected values.
+
+
+@dataclass(frozen=True)
+class Scenario:
+    id: str
+    array_kwargs: dict[str, Any]
+    # (selection, value) writes applied in order. value may be a scalar or array.
+    writes: tuple[tuple[Any, Any], ...] = ()
+    # selections to read back and check against the reference. () means "read all".
+    reads: tuple[Any, ...] = (slice(None),)
+    # substrings of chunk keys that must be present / absent after the writes.
+    # Only checked on the sync store (key layout is identical across stores, but
+    # we keep it to one axis to avoid asserting store internals twice).
+    keys_present: tuple[str, ...] = ()
+    keys_absent: tuple[str, ...] = ()
+
+    def reference(self) -> np.ndarray:
+        """The numpy array the scenario's writes should produce, starting from
+        the array's fill value."""
+        kw = self.array_kwargs
+        shape = kw["shape"]
+        dtype = np.dtype(kw["dtype"])
+        fill = kw.get("fill_value", 0)
+        ref = np.full(shape, fill, dtype=dtype)
+        for sel, value in self.writes:
+            ref[sel] = value
+        return ref
+
+
+def _val(n: int, dtype: str, offset: int = 1) -> np.ndarray:
+    return np.arange(offset, offset + n, dtype=dtype)
+
+
+# Common dtype/chunk presets reused below.
+_F64 = {"dtype": "float64", "fill_value": 0.0}
+_I32 = {"dtype": "int32", "fill_value": -1}
+
+SCENARIOS: tuple[Scenario, ...] = (
+    # --- full-array roundtrips across layouts/codecs ------------------------
+    Scenario(
+        "1d-unsharded-roundtrip",
+        {"shape": (100,), "chunks": (10,), "shards": None, "compressors": None, **_F64},
+        writes=((slice(None), _val(100, "float64")),),
+    ),
+    Scenario(
+        "1d-sharded-roundtrip",
+        {"shape": (100,), "chunks": (10,), "shards": (100,), "compressors": None, **_F64},
+        writes=((slice(None), _val(100, "float64")),),
+    ),
+    Scenario(
+        "1d-multi-chunk-shard-roundtrip",
+        {"shape": (100,), "chunks": (10,), "shards": (50,), "compressors": None, **_F64},
+        writes=((slice(None), _val(100, "float64")),),
+    ),
+    Scenario(
+        "2d-unsharded-roundtrip",
+        {"shape": (10, 20), "chunks": (5, 10), "shards": None, "compressors": None, **_I32},
+        writes=((slice(None), np.arange(200, dtype="int32").reshape(10, 20)),),
+    ),
+    Scenario(
+        "2d-sharded-roundtrip",
+        {"shape": (20, 20), "chunks": (5, 5), "shards": (10, 10), "compressors": None, **_I32},
+        writes=((slice(None), np.arange(400, dtype="int32").reshape(20, 20)),),
+    ),
+    Scenario(
+        "1d-gzip-roundtrip",
+        {
+            "shape": (100,),
+            "chunks": (10,),
+            "shards": None,
+            "compressors": {"name": "gzip", "configuration": {"level": 1}},
+            **_F64,
+        },
+        writes=((slice(None), _val(100, "float64")),),
+    ),
+    Scenario(
+        "1d-zstd-roundtrip",
+        {
+            "shape": (100,),
+            "chunks": (10,),
+            "shards": None,
+            "compressors": {"name": "zstd", "configuration": {"level": 1}},
+            **_F64,
+        },
+        writes=((slice(None), _val(100, "float64")),),
+    ),
+    Scenario(
+        "1d-float32-roundtrip",
+        {
+            "shape": (50,),
+            "chunks": (10,),
+            "shards": None,
+            "compressors": None,
+            "dtype": "float32",
+            "fill_value": 0.0,
+        },
+        writes=((slice(None), _val(50, "float32")),),
+    ),
+    # zarr v2 goes through the V2Codec wrapper (filters + compressor), a
+    # different codec path than the v3 AA/AB/BB chain — and a different sync
+    # implementation under FusedCodecPipeline. Without these scenarios, v2 was
+    # only exercised implicitly via whichever pipeline is the global default.
+    Scenario(
+        "v2-roundtrip",
+        {
+            "shape": (100,),
+            "chunks": (10,),
+            "shards": None,
+            "compressors": None,
+            "zarr_format": 2,
+            **_F64,
+        },
+        writes=((slice(None), _val(100, "float64")),),
+    ),
+    Scenario(
+        "v2-gzip-roundtrip",
+        {
+            "shape": (100,),
+            "chunks": (10,),
+            "shards": None,
+            "compressors": numcodecs.GZip(level=1),
+            "zarr_format": 2,
+            **_F64,
+        },
+        writes=((slice(None), _val(100, "float64")),),
+    ),
+    # v2 filters are the other half of the V2Codec wrapper (numcodecs
+    # array->array filters, a distinct branch from the compressor in
+    # _encode_sync/_decode_sync).
+    Scenario(
+        "v2-filter-gzip-roundtrip",
+        {
+            "shape": (100,),
+            "chunks": (10,),
+            "shards": None,
+            "filters": numcodecs.Delta(dtype="float64"),
+            "compressors": numcodecs.GZip(level=1),
+            "zarr_format": 2,
+            **_F64,
+        },
+        writes=((slice(None), _val(100, "float64")),),
+    ),
+    # --- read unwritten chunks -> fill value --------------------------------
+    Scenario(
+        "missing-chunks-fill",
+        {
+            "shape": (100,),
+            "chunks": (10,),
+            "shards": None,
+            "compressors": None,
+            "dtype": "float64",
+            "fill_value": -7.0,
+        },
+        writes=(),
+    ),
+    Scenario(
+        "missing-chunks-fill-sharded",
+        {
+            "shape": (100,),
+            "chunks": (10,),
+            "shards": (100,),
+            "compressors": None,
+            "dtype": "float64",
+            "fill_value": -7.0,
+        },
+        writes=(),
+    ),
+    # --- partial write, varied read selections ------------------------------
+    Scenario(
+        "partial-write-full-read",
+        {"shape": (100,), "chunks": (10,), "shards": None, "compressors": None, **_F64},
+        writes=((slice(5, 15), _val(10, "float64")),),
+        reads=(slice(None),),
+    ),
+    Scenario(
+        "full-write-strided-read",
+        {"shape": (100,), "chunks": (10,), "shards": None, "compressors": None, **_F64},
+        writes=((slice(None), _val(100, "float64")),),
+        reads=(np.s_[::3], np.s_[10:20]),
+    ),
+    Scenario(
+        "partial-write-partial-read-sharded",
+        {"shape": (100,), "chunks": (10,), "shards": (100,), "compressors": None, **_F64},
+        writes=((slice(20, 70), _val(50, "float64")),),
+        reads=(np.s_[30:60], slice(None)),
+    ),
+    # scalar single-element reads from a sharded array hit the sharding codec's
+    # partial-decode path (_decode_partial_single), distinct from slice reads.
+    Scenario(
+        "sharded-scalar-reads-1d",
+        {"shape": (100,), "chunks": (10,), "shards": (50,), "compressors": None, **_F64},
+        writes=((slice(None), _val(100, "float64")),),
+        reads=(np.s_[0], np.s_[50], np.s_[99], np.s_[::3]),
+    ),
+    Scenario(
+        "sharded-scalar-reads-2d",
+        {"shape": (20, 20), "chunks": (5, 5), "shards": (10, 10), "compressors": None, **_I32},
+        writes=((slice(None), np.arange(400, dtype="int32").reshape(20, 20)),),
+        reads=(np.s_[0, 0], np.s_[10, 10], np.s_[19, 19]),
+    ),
+    # --- spec-changing codec (transpose): the async-spec-evolution guard ----
+    Scenario(
+        "transpose",
+        {
+            "shape": (8, 12),
+            "chunks": (2, 4),
+            "shards": None,
+            "filters": [TransposeCodec(order=(1, 0))],
+            "serializer": BytesCodec(),
+            **_I32,
+        },
+        writes=((slice(None), np.arange(96, dtype="int32").reshape(8, 12)),),
+        reads=(slice(None), np.s_[1:7, 2:10]),
+    ),
+    Scenario(
+        "transpose-gzip",
+        {
+            "shape": (8, 12),
+            "chunks": (2, 4),
+            "shards": None,
+            "filters": [TransposeCodec(order=(1, 0))],
+            "serializer": BytesCodec(),
+            "compressors": GzipCodec(level=1),
+            **_I32,
+        },
+        writes=((slice(None), np.arange(96, dtype="int32").reshape(8, 12)),),
+        reads=(slice(None), np.s_[1:7, 2:10]),
+    ),
+    # --- nested sharding ----------------------------------------------------
+    Scenario(
+        "nested-sharding",
+        {
+            "shape": (20, 20),
+            "chunks": (10, 10),
+            "shards": None,
+            "compressors": None,
+            **_I32,
+            "fill_value": 0,
+            "serializer": ShardingCodec(
+                chunk_shape=(10, 10), codecs=[ShardingCodec(chunk_shape=(5, 5))]
+            ),
+        },
+        writes=((slice(None), np.arange(400, dtype="int32").reshape(20, 20)),),
+    ),
+    # --- partial overwrite of an existing shard (merge) ---------------------
+    Scenario(
+        "partial-shard-overwrite",
+        {
+            "shape": (40,),
+            "chunks": (4,),
+            "shards": (40,),
+            "compressors": None,
+            **_I32,
+            "config": {"write_empty_chunks": True},
+        },
+        writes=(
+            (slice(None), np.arange(40, dtype="int32")),
+            (slice(7, 18), _val(11, "int32", 700)),
+        ),
+    ),
+    # --- write_empty_chunks: storage-key presence/absence -------------------
+    Scenario(
+        "write-empty-false-omits-fill-chunk",
+        {
+            "shape": (20,),
+            "chunks": (10,),
+            "shards": None,
+            "compressors": None,
+            **_F64,
+            "config": {"write_empty_chunks": False},
+        },
+        writes=((slice(0, 10), _val(10, "float64")), (slice(10, 20), np.zeros(10, "float64"))),
+        keys_present=("c/0",),
+        keys_absent=("c/1",),
+    ),
+    Scenario(
+        "write-empty-true-persists-fill-chunk",
+        {
+            "shape": (20,),
+            "chunks": (10,),
+            "shards": None,
+            "compressors": None,
+            **_F64,
+            "config": {"write_empty_chunks": True},
+        },
+        writes=((slice(None), np.zeros(20, "float64")),),
+        keys_present=("c/0", "c/1"),
+    ),
+    # default config (no explicit write_empty_chunks) must still skip fill chunks
+    Scenario(
+        "default-config-omits-fill-chunk",
+        {"shape": (20,), "chunks": (10,), "shards": None, "compressors": None, **_F64},
+        writes=((slice(10, 20), np.zeros(10, "float64")),),
+        keys_absent=("c/1",),
+    ),
+)
+
+
+class CodecPipelineTests:
+    """Behavior every codec pipeline must satisfy, on sync and async stores.
+
+    Subclasses set ``pipeline_path`` to the fully-qualified pipeline class.
+    """
+
+    pipeline_path: str
+
+    @pytest.fixture(autouse=True)
+    def _use_pipeline(self) -> Iterator[None]:
+        with zarr_config.set({"codec_pipeline.path": self.pipeline_path}):
+            yield
+
+    @pytest.fixture(params=STORE_KINDS)
+    def store(self, request: pytest.FixtureRequest) -> Store:
+        return _make_store(request.param)
+
+    @staticmethod
+    def _chunk_keys(store: Store) -> set[str]:
+        """All non-metadata keys currently in the store (v3 and v2 metadata)."""
+        import asyncio
+
+        def _is_metadata(key: str) -> bool:
+            tail = key.rsplit("/", 1)[-1]
+            return tail in ("zarr.json", ".zarray", ".zattrs", ".zgroup", ".zmetadata")
+
+        async def _list() -> set[str]:
+            return {k async for k in store.list() if not _is_metadata(k)}
+
+        return asyncio.run(_list())
+
+    # -- the common shape: create -> write -> [assert keys] -> assert reads ----
+
+    @pytest.mark.parametrize("scenario", SCENARIOS, ids=lambda s: s.id)
+    def test_scenario(self, store: Store, scenario: Scenario) -> None:
+        """Create an array, apply the scenario's writes, optionally assert which
+        chunk keys exist, then assert each read selection matches a numpy
+        reference. Run against every pipeline (subclass) and store kind (fixture).
+        """
+        arr = zarr.create_array(store=store, **scenario.array_kwargs)
+        for sel, value in scenario.writes:
+            arr[sel] = value
+
+        ref = scenario.reference()
+        for sel in scenario.reads:
+            np.testing.assert_array_equal(
+                arr[sel], ref[sel], err_msg=f"{scenario.id}: read {sel!r} mismatch"
+            )
+
+        if scenario.keys_present or scenario.keys_absent:
+            keys = self._chunk_keys(store)
+            for present in scenario.keys_present:
+                assert any(present in k for k in keys), (present, keys)
+            for absent in scenario.keys_absent:
+                assert not any(absent in k for k in keys), (absent, keys)
+
+    # -- outliers that don't fit the create/write/read scenario shape ----------
+
+    def test_read_missing_chunks_false_raises(self, store: Store) -> None:
+        """read_missing_chunks=False makes reading an unwritten chunk an error,
+        not a fill — a different assertion (raises) than the scenario shape."""
+        arr = zarr.create_array(
+            store=store,
+            shape=(20,),
+            dtype="float64",
+            chunks=(10,),
+            shards=None,
+            compressors=None,
+            fill_value=0.0,
+            config={"read_missing_chunks": False},
+        )
+        with pytest.raises(ChunkNotFoundError):
+            arr[:]
+
+    def test_read_missing_chunks_false_sharded_semantics(self, store: Store) -> None:
+        """read_missing_chunks=False is a STORE-KEY-level promise on sharded arrays.
+
+        The config exists to help consumers distinguish a transport error from a
+        truly missing chunk. That distinction applies to store keys: a missing
+        SHARD key raises ChunkNotFoundError. It does not cleanly apply to inner
+        subchunks of a shard that was fetched successfully — there is no
+        transport ambiguity there, the shard index simply records the subchunk
+        as absent — so missing inner subchunks fill with the fill value rather
+        than raising. This pins that asymmetry as intentional.
+        """
+        arr = zarr.create_array(
+            store=store,
+            shape=(100,),
+            dtype="float64",
+            chunks=(10,),
+            shards=(50,),
+            compressors=None,
+            fill_value=-1.0,
+            config={"read_missing_chunks": False},
+        )
+        # No shard key exists yet: reading is a missing-store-key error.
+        with pytest.raises(ChunkNotFoundError):
+            arr[:]
+
+        # Write one inner chunk of the first shard. The shard key now exists,
+        # but most inner subchunks are absent from its index.
+        arr[20:30] = np.arange(10, dtype="float64")
+
+        # Reading across written + absent inner subchunks of the EXISTING shard
+        # fills rather than raises.
+        out = arr[15:35]
+        expected = np.full(20, -1.0)
+        expected[5:15] = np.arange(10, dtype="float64")
+        np.testing.assert_array_equal(out, expected)
+
+        # Both halves of the asymmetry in ONE read against the SAME partially
+        # written array: shard 0 exists (absent subchunks fill), shard 1 has no
+        # store key (raises) — pins that the raise still fires once some shard
+        # exists, and not only on a fully-empty array.
+        with pytest.raises(ChunkNotFoundError):
+            arr[:]
+        with pytest.raises(ChunkNotFoundError):
+            arr[45:55]  # spans the existing and the missing shard
+
+    @pytest.mark.parametrize("subchunk_write_order", ["morton", "lexicographic", "colexicographic"])
+    def test_partial_write_after_reopen_is_correct(
+        self, store: Store, subchunk_write_order: SubchunkWriteOrder
+    ) -> None:
+        """Has an extra step the scenario shape lacks — a REOPEN between writes.
+
+        Reopening a sharded array and partially overwriting it must read back
+        correctly regardless of the original subchunk_write_order. subchunk_write_
+        order is intentionally NOT recoverable on reopen, so chunk locations on a
+        write to an existing shard must come from the STORED shard index, not the
+        (now-default) live order. A non-square inner grid makes the orders
+        physically distinct, so a wrong offset would corrupt data and fail here.
+        """
+        shape, shard, inner = (6, 4), (6, 4), (2, 2)
+        arr = zarr.create_array(
+            store=store,
+            shape=shape,
+            dtype="int32",
+            chunks=shard,
+            fill_value=-1,
+            compressors=None,
+            config={"write_empty_chunks": True},
+            serializer=ShardingCodec(
+                chunk_shape=inner, codecs=[BytesCodec()], subchunk_write_order=subchunk_write_order
+            ),
+        )
+        ref = np.arange(24, dtype="int32").reshape(shape)
+        arr[:] = ref
+
+        reopened = zarr.open_array(store=store, mode="r+")
+        reopened[1:5, 0:3] = 777  # partial overwrite into the existing shard
+        ref[1:5, 0:3] = 777
+        np.testing.assert_array_equal(reopened[:], ref)
+
+    def test_empty_shard_deleted_after_overwrite_to_fill(self, store: Store) -> None:
+        """A shard written with real data and then fully overwritten back to the
+        fill value must have its store key deleted, not left as a stale blob.
+
+        This has a mid-sequence key assertion (present after write 1, absent
+        after write 2) that the create/write/read scenario shape can't express.
+        """
+        arr = zarr.create_array(
+            store=store,
+            shape=(16,),
+            chunks=(4,),
+            shards=(8,),
+            dtype="float64",
+            compressors=None,
+            fill_value=0.0,
+        )
+        arr[0:8] = np.arange(8, dtype="float64") + 1
+        assert any("c/0" in k for k in self._chunk_keys(store))
+        arr[0:8] = 0.0
+        assert not any("c/0" in k for k in self._chunk_keys(store)), (
+            "shard should be deleted when fully overwritten to fill value"
+        )
+
+    def test_read_write_methods_do_not_branch_on_sharding_codec_type(self) -> None:
+        """Pipeline read/write must dispatch on supports_partial_encode/decode,
+        not isinstance(ShardingCodec) — a static guard against type-branching.
+
+        Scoped to this pipeline's own read/write methods (other helpers, e.g.
+        metadata validation, may legitimately isinstance-check ShardingCodec).
+        """
+        import inspect
+        import re
+
+        from zarr.registry import get_pipeline_class
+
+        # The autouse _use_pipeline fixture has set codec_pipeline.path to this
+        # subclass's pipeline; resolve the class it points at and guard that.
+        # reload_config=False so the fixture's config override is honored
+        # (reload_config=True re-reads the base config, ignoring the override).
+        cls = get_pipeline_class(reload_config=False)
+
+        pattern = re.compile(r"isinstance\s*\([^)]*ShardingCodec[^)]*\)")
+        for method_name in ("read", "write", "read_sync", "write_sync"):
+            method = getattr(cls, method_name, None)
+            if method is None:
+                continue
+            matches = pattern.findall(inspect.getsource(method))
+            assert not matches, (
+                f"{cls.__name__}.{method_name} contains an isinstance check on "
+                f"ShardingCodec; use supports_partial_encode/decode instead. "
+                f"Matches: {matches}"
+            )
+
+
+class TestBatchedPipeline(CodecPipelineTests):
+    pipeline_path = "zarr.core.codec_pipeline.BatchedCodecPipeline"
+
+
+class TestFusedPipeline(CodecPipelineTests):
+    pipeline_path = "zarr.core.codec_pipeline.FusedCodecPipeline"
diff --git a/tests/test_codecs/test_codecs.py b/tests/test_codecs/test_codecs.py
index 6e3e3f6d28..e1177b087e 100644
--- a/tests/test_codecs/test_codecs.py
+++ b/tests/test_codecs/test_codecs.py
@@ -402,3 +402,41 @@ async def test_resize(store: Store) -> None:
     assert await store.get(f"{path}/0.1", prototype=default_buffer_prototype()) is not None
     assert await store.get(f"{path}/1.0", prototype=default_buffer_prototype()) is None
     assert await store.get(f"{path}/1.1", prototype=default_buffer_prototype()) is None
+
+
+def _resolve_metadata_codecs() -> list[Codec]:
+    from zarr.codecs.crc32c_ import Crc32cCodec
+    from zarr.codecs.zstd import ZstdCodec
+
+    return [
+        BytesCodec(),
+        GzipCodec(level=1),
+        TransposeCodec(order=(0,)),
+        Crc32cCodec(),
+        ZstdCodec(level=1),
+    ]
+
+
+@pytest.mark.parametrize("codec", _resolve_metadata_codecs(), ids=lambda c: type(c).__name__)
+def test_resolve_metadata_only_mutates_shape(codec: Codec) -> None:
+    """A codec's resolve_metadata may change a chunk's `shape` but must leave the
+    prototype, dtype, fill_value, and config untouched -- the pipeline relies on
+    those being stable across the codec chain.
+    """
+    from zarr.core.array_spec import ArrayConfig, ArraySpec
+    from zarr.core.dtype import get_data_type_from_native_dtype
+
+    zdtype = get_data_type_from_native_dtype(np.dtype("float64"))
+    spec_in = ArraySpec(
+        shape=(10,),
+        dtype=zdtype,
+        fill_value=zdtype.cast_scalar(0.0),
+        config=ArrayConfig(order="C", write_empty_chunks=False),
+        prototype=default_buffer_prototype(),
+    )
+    spec_out = codec.resolve_metadata(spec_in)
+    name = type(codec).__name__
+    assert spec_out.prototype is spec_in.prototype, f"{name} changed prototype"
+    assert spec_out.dtype == spec_in.dtype, f"{name} changed dtype"
+    assert spec_out.fill_value == spec_in.fill_value, f"{name} changed fill_value"
+    assert spec_out.config == spec_in.config, f"{name} changed config"
diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py
index 856d29ef7a..c65bf3de43 100644
--- a/tests/test_codecs/test_sharding.py
+++ b/tests/test_codecs/test_sharding.py
@@ -28,6 +28,64 @@
 from .test_codecs import _AsyncArrayProxy, order_from_dim
 
 
+def _reads_are_sync(store_mock: AsyncMock) -> bool:
+    """True when the partial-shard read for this store+pipeline goes through the
+    synchronous methods (get_sync / get_ranges_sync). That requires BOTH the
+    configured pipeline to be the sync (Fused) one AND the store to support sync
+    reads — a Fused read against a non-sync store (e.g. ZipStore) falls back to
+    the async path. Lets the partial-shard-read tests assert the same intent
+    against whichever method family is actually exercised."""
+    from zarr.abc.store import SupportsGetSync
+    from zarr.core.config import config
+
+    pipeline_is_sync = "Fused" in config.get("codec_pipeline.path")
+    # store_mock wraps the real store; check the wrapped class for sync support.
+    wrapped = getattr(store_mock, "_mock_wraps", store_mock)
+    return pipeline_is_sync and isinstance(wrapped, SupportsGetSync)
+
+
+def _index_read_count(store_mock: AsyncMock) -> int:
+    """Number of shard-index reads, regardless of sync/async pipeline."""
+    method = store_mock.get_sync if _reads_are_sync(store_mock) else store_mock.get
+    return int(method.call_count)
+
+
+def _range_read_count(store_mock: AsyncMock) -> int:
+    """Number of coalesced chunk-data reads, regardless of sync/async pipeline."""
+    method = store_mock.get_ranges_sync if _reads_are_sync(store_mock) else store_mock.get_ranges
+    return int(method.call_count)
+
+
+def _fail_index_read(store_mock: AsyncMock) -> None:
+    """Simulate the shard-index load returning nothing, for the active path."""
+    if _reads_are_sync(store_mock):
+        store_mock.get_sync.return_value = None
+    else:
+        store_mock.get.return_value = None
+
+
+def _fail_chunk_reads(
+    store_mock: AsyncMock, key_absent_exc: type[Exception] = FileNotFoundError
+) -> None:
+    """Simulate chunk-data loads failing (key absent), for the active path.
+
+    Async get_ranges raises a BaseExceptionGroup; the sync get_ranges_sync mirrors
+    that contract, so both inject a FileNotFoundError-bearing group."""
+    if _reads_are_sync(store_mock):
+
+        def fail_sync(key: str, byte_ranges: Any, **kwargs: Any) -> Any:
+            raise BaseExceptionGroup("chunk read failed", [key_absent_exc(key)])
+
+        store_mock.get_ranges_sync = fail_sync
+    else:
+
+        async def fail_async(key: str, byte_ranges: Any, **kwargs: Any) -> Any:
+            raise BaseExceptionGroup("chunk read failed", [key_absent_exc(key)])
+            yield  # type: ignore[unreachable]  # marks this as an async generator
+
+        store_mock.get_ranges = fail_async
+
+
 @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"])
 @pytest.mark.parametrize("index_location", ["start", "end"])
 @pytest.mark.parametrize(
@@ -231,18 +289,18 @@ def test_sharding_multiple_chunks_partial_shard_read(
     # for a total of 6 chunks accessed
     assert np.allclose(a[0, 22:42], np.arange(22, 42, dtype="float32"))
 
-    # 2 shard index reads via store.get() + 2 get_ranges calls (one per shard)
-    assert store_mock.get.call_count == 2
-    assert store_mock.get_ranges.call_count == 2
+    # 2 shard index reads + 2 coalesced chunk-data reads (one per shard)
+    assert _index_read_count(store_mock) == 2
+    assert _range_read_count(store_mock) == 2
 
     store_mock.reset_mock()
 
     # Reads 4 chunks from both shards along dimension 0 for a total of 8 chunks accessed
     assert np.allclose(a[:, 0], np.arange(0, data.size, array_shape[1], dtype="float32"))
 
-    # 2 shard index reads via store.get() + 2 get_ranges calls (one per shard)
-    assert store_mock.get.call_count == 2
-    assert store_mock.get_ranges.call_count == 2
+    # 2 shard index reads + 2 coalesced chunk-data reads (one per shard)
+    assert _index_read_count(store_mock) == 2
+    assert _range_read_count(store_mock) == 2
 
 
 @pytest.mark.parametrize("index_location", ["start", "end"])
@@ -278,9 +336,9 @@ def test_sharding_duplicate_read_indexes(
     indexer = [8, 8, 12, 12]
     assert np.array_equal(a[indexer], data[indexer])
 
-    # 1 shard index read via store.get() + 1 get_ranges call
-    assert store_mock.get.call_count == 1
-    assert store_mock.get_ranges.call_count == 1
+    # 1 shard index read + 1 coalesced chunk-data read
+    assert _index_read_count(store_mock) == 1
+    assert _range_read_count(store_mock) == 1
 
 
 @pytest.mark.parametrize("index_location", ["start", "end"])
@@ -369,8 +427,6 @@ def test_sharding_partial_shard_read__index_load_fails(
     fill_value = -999
 
     store_mock = AsyncMock(wraps=store, spec=store.__class__)
-    # loading the index is the first call to .get() so returning None will simulate an index load failure
-    store_mock.get.return_value = None
 
     a = zarr.create_array(
         StorePath(store_mock),
@@ -383,6 +439,10 @@ def test_sharding_partial_shard_read__index_load_fails(
     )
     a[:] = data
 
+    # Loading the index returns None -> simulate an index load failure, on
+    # whichever read method the active pipeline uses (get / get_sync).
+    _fail_index_read(store_mock)
+
     # Read from one of two chunks in a shard to test the partial shard read path
     assert a[0] == fill_value
     assert a[0] != data[0]
@@ -449,16 +509,12 @@ def test_sharding_partial_shard_read__chunk_load_fails(
     a[:] = data
 
     # Set up store mock after array creation to simulate chunk load failure.
-    # Index loads still succeed (via store.get), but chunk-byte loads fail
-    # (via store.get_ranges raising BaseExceptionGroup containing FileNotFoundError —
-    # the same shape Store.get_ranges produces when a key is absent).
+    # Index loads still succeed, but chunk-byte loads fail (the coalesced range
+    # read raises a BaseExceptionGroup containing FileNotFoundError — the same
+    # shape produced when a key is absent), on whichever read method the active
+    # pipeline uses (get_ranges / get_ranges_sync).
     store_mock.reset_mock()
-
-    async def fail_chunk_reads(key: str, byte_ranges: Any, **kwargs: Any) -> Any:
-        raise BaseExceptionGroup("chunk read failed", [FileNotFoundError(key)])
-        yield  # type: ignore[unreachable]  # marks this as an async generator
-
-    store_mock.get_ranges = fail_chunk_reads
+    _fail_chunk_reads(store_mock)
 
     # Read from one of two chunks in a shard to test the partial shard read path
     assert a[0] == fill_value
diff --git a/tests/test_codecs/test_sharding_unit.py b/tests/test_codecs/test_sharding_unit.py
index 6e022ed9fa..6736b1047c 100644
--- a/tests/test_codecs/test_sharding_unit.py
+++ b/tests/test_codecs/test_sharding_unit.py
@@ -1,6 +1,14 @@
+import asyncio
+from dataclasses import dataclass, replace
+from typing import Any
+
 import numpy as np
 import pytest
 
+from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec
+from zarr.codecs.bytes import BytesCodec
+from zarr.codecs.crc32c_ import Crc32cCodec
+from zarr.codecs.gzip import GzipCodec
 from zarr.codecs.sharding import (
     MAX_UINT_64,
     ShardingCodec,
@@ -8,8 +16,12 @@
     _ShardingByteGetter,
     _ShardReader,
 )
-from zarr.core.buffer import default_buffer_prototype
+from zarr.core.array_spec import ArrayConfig, ArraySpec
+from zarr.core.buffer import Buffer as ABCBuffer
+from zarr.core.buffer import NDBuffer, default_buffer_prototype
 from zarr.core.buffer.cpu import Buffer
+from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer
+from zarr.core.dtype import get_data_type_from_native_dtype
 from zarr.storage._common import StorePath
 from zarr.storage._memory import MemoryStore
 
@@ -486,3 +498,254 @@ def test_is_total_shard_1d() -> None:
     # Partial
     partial_coords: set[tuple[int, ...]] = {(0,), (2,)}
     assert codec._is_total_shard(partial_coords, chunks_per_shard) is False
+
+
+# ============================================================================
+# _inner_codecs_fixed_size tests
+# ============================================================================
+
+
+def test_inner_codecs_fixed_size_no_compression() -> None:
+    """Inner codecs without compression should be fixed-size."""
+    codec = ShardingCodec(chunk_shape=(10,), codecs=[BytesCodec()])
+    assert codec._inner_codecs_fixed_size is True
+
+
+def test_inner_codecs_fixed_size_with_compression() -> None:
+    """Inner codecs with compression should NOT be fixed-size."""
+    codec = ShardingCodec(chunk_shape=(10,), codecs=[BytesCodec(), GzipCodec()])
+    assert codec._inner_codecs_fixed_size is False
+
+
+# ============================================================================
+# inner-chain spec threading
+# ============================================================================
+
+
+@dataclass(frozen=True)
+class _WidenToInt16(ArrayArrayCodec):
+    """Test-only sync-capable AA codec that reports its output dtype as int16."""
+
+    is_fixed_size = True
+
+    def to_dict(self) -> dict[str, Any]:
+        return {"name": "_widen_to_int16"}
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "_WidenToInt16":
+        return cls()
+
+    def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
+        return replace(chunk_spec, dtype=get_data_type_from_native_dtype(np.dtype("int16")))
+
+    def compute_encoded_size(self, input_byte_length: int, _spec: ArraySpec) -> int:
+        return input_byte_length
+
+    def _encode_sync(self, chunk_array: Any, chunk_spec: ArraySpec) -> Any:
+        return chunk_array  # pragma: no cover
+
+    def _decode_sync(self, chunk_array: Any, chunk_spec: ArraySpec) -> Any:
+        return chunk_array  # pragma: no cover
+
+    async def _encode_single(self, chunk_array: Any, chunk_spec: ArraySpec) -> Any:
+        return chunk_array  # pragma: no cover
+
+    async def _decode_single(self, chunk_array: Any, chunk_spec: ArraySpec) -> Any:
+        return chunk_array  # pragma: no cover
+
+
+def _int8_spec(shape: tuple[int, ...]) -> ArraySpec:
+    zdtype = get_data_type_from_native_dtype(np.dtype("int8"))  # single-byte source
+    return ArraySpec(
+        shape=shape,
+        dtype=zdtype,
+        fill_value=zdtype.cast_scalar(0),
+        config=ArrayConfig(order="C", write_empty_chunks=False),
+        prototype=default_buffer_prototype(),
+    )
+
+
+def test_inner_chunk_transform_threads_spec() -> None:
+    """The inner codec chain must be evolved with the spec threaded forward.
+
+    A dtype-widening inner array->array codec means the BytesCodec serializer
+    is evolved against the WIDENED dtype, not the single-byte source —
+    otherwise it strips its `endian` to None and fails to decode multi-byte
+    inner chunks. Same contract as the pipeline-level `evolve_codecs`
+    regression test, applied to `_get_inner_chunk_transform`.
+    """
+    codec = ShardingCodec(chunk_shape=(4,), codecs=[_WidenToInt16(), BytesCodec(endian="little")])
+    shard_spec = _int8_spec((8,))
+
+    transform = codec._get_inner_chunk_transform(shard_spec)
+    serializer = transform._ab_codec
+    assert isinstance(serializer, BytesCodec)
+    assert serializer.endian is not None, (
+        "inner BytesCodec lost its `endian` — _get_inner_chunk_transform did not "
+        "thread the dtype-widening codec's spec into the serializer"
+    )
+
+
+def test_evolve_from_array_spec_threads_spec() -> None:
+    """`ShardingCodec.evolve_from_array_spec` must thread the spec through the
+    inner chain, like `_get_inner_chunk_transform` does.
+
+    This method runs EARLIER, on the real array-creation path (the outer
+    pipeline evolves the sharding codec itself), so an unthreaded evolve here
+    bakes an endian-stripped BytesCodec into the evolved instance's `codecs`
+    before the transform builders ever run — and the later threaded evolve then
+    raises instead of recovering. Calling `_get_inner_chunk_transform` on the
+    EVOLVED instance pins the full real path.
+    """
+    codec = ShardingCodec(chunk_shape=(4,), codecs=[_WidenToInt16(), BytesCodec(endian="little")])
+    # the array spec the OUTER pipeline evolves the sharding codec against
+    array_spec = _int8_spec((8,))
+
+    evolved = codec.evolve_from_array_spec(array_spec)
+    inner_serializer = next(c for c in evolved.codecs if isinstance(c, BytesCodec))
+    assert inner_serializer.endian is not None, (
+        "evolve_from_array_spec evolved the inner BytesCodec against the "
+        "un-widened spec, stripping its `endian`"
+    )
+
+    # and the evolved instance must still build a working inner transform
+    transform = evolved._get_inner_chunk_transform(array_spec)
+    serializer = transform._ab_codec
+    assert isinstance(serializer, BytesCodec)
+    assert serializer.endian is not None
+
+
+# ============================================================================
+# async whole-shard codec methods
+#
+# `ShardingCodec` advertises partial decode/encode, so the codec pipeline
+# always routes sharded reads/writes through `_decode_partial_single` /
+# `_encode_partial_single`. The whole-shard async methods `_decode_single` /
+# `_encode_single` are reached only via the direct `ArrayBytesCodec` API (e.g.
+# a consumer that calls the codec outside a pipeline), so they get no coverage
+# from end-to-end array tests. Pin them with a direct round-trip.
+# ============================================================================
+
+
+@pytest.mark.parametrize("write_empty_chunks", [True, False])
+def test_decode_single_encode_single_roundtrip(write_empty_chunks: bool) -> None:
+    """`ShardingCodec._encode_single` then `_decode_single` round-trips a whole
+    shard. Covers the async whole-shard path the pipeline bypasses in favor of
+    the partial methods."""
+    zdtype = get_data_type_from_native_dtype(np.dtype("float64"))
+    spec = ArraySpec(
+        shape=(50,),
+        dtype=zdtype,
+        fill_value=zdtype.cast_scalar(0),
+        config=ArrayConfig(order="C", write_empty_chunks=write_empty_chunks),
+        prototype=default_buffer_prototype(),
+    )
+    codec = ShardingCodec(chunk_shape=(10,), codecs=[BytesCodec()])
+    data = np.arange(50, dtype="float64")
+    value = CPUNDBuffer.from_numpy_array(data)
+
+    encoded = asyncio.run(codec._encode_single(value, spec))
+    assert encoded is not None  # data is non-empty -> a shard is always produced
+    decoded = asyncio.run(codec._decode_single(encoded, spec))
+    np.testing.assert_array_equal(decoded.as_numpy_array(), data)
+
+
+def test_encode_single_all_empty_returns_none() -> None:
+    """`_encode_single` of an all-fill shard under write_empty_chunks=False
+    elides every inner chunk and returns None (the all-empty branch)."""
+    zdtype = get_data_type_from_native_dtype(np.dtype("float64"))
+    spec = ArraySpec(
+        shape=(50,),
+        dtype=zdtype,
+        fill_value=zdtype.cast_scalar(0),
+        config=ArrayConfig(order="C", write_empty_chunks=False),
+        prototype=default_buffer_prototype(),
+    )
+    codec = ShardingCodec(chunk_shape=(10,), codecs=[BytesCodec()])
+    fill = CPUNDBuffer.from_numpy_array(np.zeros(50, dtype="float64"))
+
+    assert asyncio.run(codec._encode_single(fill, spec)) is None
+
+
+def test_decode_single_all_empty_fills() -> None:
+    """`_decode_single` of a shard whose index is all-empty fills the output
+    with the fill value (the is_all_empty fast path)."""
+    zdtype = get_data_type_from_native_dtype(np.dtype("float64"))
+    spec = ArraySpec(
+        shape=(50,),
+        dtype=zdtype,
+        fill_value=zdtype.cast_scalar(-1.0),
+        config=ArrayConfig(order="C", write_empty_chunks=False),
+        prototype=default_buffer_prototype(),
+    )
+    codec = ShardingCodec(chunk_shape=(10,), codecs=[BytesCodec()])
+    # an empty shard is just the encoded empty index
+    empty_index = asyncio.run(codec._encode_shard_index(_ShardIndex.create_empty((5,))))
+    decoded = asyncio.run(codec._decode_single(empty_index, spec))
+    np.testing.assert_array_equal(decoded.as_numpy_array(), np.full(50, -1.0))
+
+
+# ============================================================================
+# async-only index codec fallback (#269)
+#
+# `_decode_shard_index` / `_encode_shard_index` delegate to their sync twins
+# when every index codec is sync-capable, and otherwise fall back to the async
+# pipeline. The default index chain (bytes + crc32c) is sync-capable, so the
+# fallback is exercised only by an async-only index codec.
+# ============================================================================
+
+
+class _AsyncOnlyBytesCodec(ArrayBytesCodec):
+    """An array<->bytes codec that implements ONLY the async per-chunk methods.
+
+    Wraps a real `BytesCodec` for the actual conversion but deliberately omits
+    `_encode_sync`/`_decode_sync`, so it is NOT a `SupportsSyncCodec`. Used as
+    an index codec to force the async-pipeline fallback in
+    `_decode_shard_index`/`_encode_shard_index`.
+    """
+
+    _inner = BytesCodec()
+
+    def to_dict(self) -> dict[str, Any]:
+        return {"name": "_async_only_bytes"}
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "_AsyncOnlyBytesCodec":
+        return cls()
+
+    def evolve_from_array_spec(self, array_spec: ArraySpec) -> "_AsyncOnlyBytesCodec":
+        return self
+
+    def compute_encoded_size(self, input_byte_length: int, _spec: ArraySpec) -> int:
+        return input_byte_length
+
+    async def _decode_single(self, chunk_bytes: ABCBuffer, chunk_spec: ArraySpec) -> NDBuffer:
+        return await self._inner._decode_single(chunk_bytes, chunk_spec)
+
+    async def _encode_single(self, chunk_array: NDBuffer, chunk_spec: ArraySpec) -> ABCBuffer:
+        result = await self._inner._encode_single(chunk_array, chunk_spec)
+        assert result is not None
+        return result
+
+
+def test_shard_index_async_fallback_for_async_only_index_codec() -> None:
+    """An async-only index codec is not sync-capable, so `_encode_shard_index`
+    and `_decode_shard_index` must take the async-pipeline fallback (#269)
+    instead of the sync twins — and still round-trip."""
+    from zarr.abc.codec import SupportsSyncCodec
+
+    codec = ShardingCodec(
+        chunk_shape=(10,),
+        codecs=[BytesCodec()],
+        index_codecs=[_AsyncOnlyBytesCodec(), Crc32cCodec()],
+    )
+    assert not codec._index_codecs_sync_capable()
+    assert not isinstance(_AsyncOnlyBytesCodec(), SupportsSyncCodec)
+
+    chunks_per_shard = (5,)
+    index = _ShardIndex.create_empty(chunks_per_shard)
+    index.set_chunk_slice((0,), slice(0, 42))
+
+    encoded = asyncio.run(codec._encode_shard_index(index))
+    decoded = asyncio.run(codec._decode_shard_index(encoded, chunks_per_shard))
+    np.testing.assert_array_equal(decoded.offsets_and_lengths, index.offsets_and_lengths)
diff --git a/tests/test_config.py b/tests/test_config.py
index 4e293e968f..c0cb0a30bb 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -61,8 +61,9 @@ def test_config_defaults_set() -> None:
                 "threading": {"max_workers": None},
                 "json_indent": 2,
                 "codec_pipeline": {
-                    "path": "zarr.core.codec_pipeline.BatchedCodecPipeline",
+                    "path": "zarr.core.codec_pipeline.FusedCodecPipeline",
                     "batch_size": 1,
+                    "max_workers": 1,
                 },
                 "codecs": {
                     "blosc": "zarr.codecs.blosc.BloscCodec",
@@ -134,7 +135,7 @@ def test_config_codec_pipeline_class(store: Store) -> None:
     # has default value
     assert get_pipeline_class().__name__ != ""
 
-    config.set({"codec_pipeline.name": "zarr.core.codec_pipeline.BatchedCodecPipeline"})
+    config.set({"codec_pipeline.path": "zarr.core.codec_pipeline.BatchedCodecPipeline"})
     assert get_pipeline_class() == zarr.core.codec_pipeline.BatchedCodecPipeline
 
     _mock = Mock()
@@ -189,10 +190,19 @@ def test_config_codec_implementation(store: Store) -> None:
     _mock = Mock()
 
     class MockBloscCodec(BloscCodec):
+        # Record a call from whichever encode entry point the active codec
+        # pipeline uses: the async `_encode_single` (BatchedCodecPipeline) or the
+        # synchronous `_encode_sync` (FusedCodecPipeline, the default). Overriding
+        # both keeps this test ("the configured codec is actually used")
+        # independent of which pipeline is the default.
         async def _encode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer | None:
             _mock.call()
             return None
 
+        def _encode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer | None:
+            _mock.call()
+            return None
+
     register_codec("blosc", MockBloscCodec)
     with config.set({"codecs.blosc": fully_qualified_name(MockBloscCodec)}):
         assert get_codec_class("blosc") == MockBloscCodec
diff --git a/tests/test_fastpath_equivalence.py b/tests/test_fastpath_equivalence.py
new file mode 100644
index 0000000000..e822ca6e68
--- /dev/null
+++ b/tests/test_fastpath_equivalence.py
@@ -0,0 +1,309 @@
+"""Property tests: every fast path must equal the general path.
+
+The codec pipelines contain fast paths that skip work whose result is known —
+the complete-chunk merge view, the vectorized whole-shard bulk decode, the
+scalar-broadcast write memoization, byte-range coalescing. Each is only safe if
+it produces results identical to the general path it bypasses. These tests pin
+that equivalence on randomized inputs, so a fast path that silently diverges
+(the bug class behind the bulk-decode endianness fix) fails here instead of
+corrupting data downstream.
+
+Convention for new fast paths: a fast path is "skip work whose result is
+known", never "a different algorithm" — and it ships with a property test in
+this module asserting equality with the general path.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import hypothesis.extra.numpy as npst
+import hypothesis.strategies as st
+import numpy as np
+from hypothesis import given, settings
+
+import zarr
+from zarr.abc.store import OffsetByteRequest, RangeByteRequest, SuffixByteRequest
+from zarr.codecs.bytes import BytesCodec
+from zarr.codecs.sharding import ShardingCodec, ShardingCodecIndexLocation
+from zarr.core.array_spec import ArrayConfig, ArraySpec
+from zarr.core.buffer import default_buffer_prototype
+from zarr.core.buffer.cpu import Buffer as CPUBuffer
+from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer
+from zarr.core.chunk_grids import ChunkGrid
+from zarr.core.codec_pipeline import _merge_chunk_array
+from zarr.core.dtype import get_data_type_from_native_dtype
+from zarr.core.indexing import BasicIndexer
+from zarr.storage import MemoryStore
+
+_DTYPES = st.sampled_from(["uint8", "int16", "float32"])
+
+
+def _spec(shape: tuple[int, ...], dtype: str, *, write_empty_chunks: bool = True) -> ArraySpec:
+    zdtype = get_data_type_from_native_dtype(np.dtype(dtype))
+    return ArraySpec(
+        shape=shape,
+        dtype=zdtype,
+        fill_value=zdtype.cast_scalar(0),
+        config=ArrayConfig(order="C", write_empty_chunks=write_empty_chunks),
+        prototype=default_buffer_prototype(),
+    )
+
+
+# ---------------------------------------------------------------------------
+# _merge_chunk_array: the complete-chunk early return (a view of
+# value[out_selection]) must equal the general create/copy + setitem path.
+# ---------------------------------------------------------------------------
+
+
+@st.composite
+def _merge_cases(draw: st.DrawFn) -> tuple[np.ndarray, tuple[int, ...], tuple[slice, ...]]:
+    ndim = draw(st.integers(1, 3))
+    chunk_shape = tuple(draw(st.integers(1, 5)) for _ in range(ndim))
+    n_blocks = draw(st.integers(1, 3))
+    dtype = draw(_DTYPES)
+    # value spans n_blocks chunk-sized blocks along axis 0; out_selection picks one
+    value_shape = (chunk_shape[0] * n_blocks, *chunk_shape[1:])
+    value = draw(npst.arrays(dtype=np.dtype(dtype), shape=value_shape))
+    block = draw(st.integers(0, n_blocks - 1))
+    out_selection = (
+        slice(block * chunk_shape[0], (block + 1) * chunk_shape[0]),
+        *(slice(0, s) for s in chunk_shape[1:]),
+    )
+    return value, chunk_shape, out_selection
+
+
+@settings(max_examples=200, deadline=None)
+@given(case=_merge_cases(), with_existing=st.booleans())
+def test_merge_complete_chunk_equals_general_path(
+    case: tuple[np.ndarray, tuple[int, ...], tuple[slice, ...]], with_existing: bool
+) -> None:
+    """The is_complete_chunk fast path (return a view of value[out_selection])
+    must produce exactly what the general merge path produces — and a complete
+    write must be independent of any existing chunk content."""
+    value_np, chunk_shape, out_selection = case
+    spec = _spec(chunk_shape, str(value_np.dtype))
+    value = CPUNDBuffer.from_numpy_array(value_np)
+    chunk_selection = tuple(slice(0, s) for s in chunk_shape)
+
+    existing = None
+    if with_existing:
+        existing = CPUNDBuffer.from_numpy_array(np.full(chunk_shape, 7, dtype=value_np.dtype))
+
+    fast = _merge_chunk_array(existing, value, out_selection, spec, chunk_selection, True, ())
+    general = _merge_chunk_array(existing, value, out_selection, spec, chunk_selection, False, ())
+    np.testing.assert_array_equal(fast.as_numpy_array(), general.as_numpy_array())
+    np.testing.assert_array_equal(fast.as_numpy_array(), value_np[out_selection])
+
+
+# ---------------------------------------------------------------------------
+# _decode_full_shard_bulk: the vectorized dense-shard decode must equal the
+# general per-chunk decode (_decode_sync), across dtypes, endianness, write
+# orders, and index locations. This is the bug class of the historical
+# bulk-decode endianness fix.
+# ---------------------------------------------------------------------------
+
+
+@st.composite
+def _shard_cases(draw: st.DrawFn) -> dict[str, Any]:
+    ndim = draw(st.integers(1, 2))
+    chunk_shape = tuple(draw(st.integers(1, 4)) for _ in range(ndim))
+    grid = tuple(draw(st.integers(1, 3)) for _ in range(ndim))
+    shard_shape = tuple(c * g for c, g in zip(chunk_shape, grid, strict=True))
+    dtype = draw(_DTYPES)
+    data = draw(npst.arrays(dtype=np.dtype(dtype), shape=shard_shape))
+    return {
+        "chunk_shape": chunk_shape,
+        "shard_shape": shard_shape,
+        "data": data,
+        "endian": draw(st.sampled_from(["little", "big"])),
+        "index_location": draw(st.sampled_from(list(ShardingCodecIndexLocation))),
+        "subchunk_write_order": draw(
+            st.sampled_from(["morton", "lexicographic", "colexicographic", "unordered"])
+        ),
+    }
+
+
+@settings(max_examples=100, deadline=None)
+@given(case=_shard_cases())
+def test_bulk_shard_decode_equals_general_decode(case: dict[str, Any]) -> None:
+    """For dense fixed-size uncompressed shards, the vectorized bulk decode must
+    reproduce the general per-chunk decode exactly, whatever the endianness,
+    subchunk write order, or index location."""
+    codec = ShardingCodec(
+        chunk_shape=case["chunk_shape"],
+        codecs=[BytesCodec(endian=case["endian"])],
+        index_location=case["index_location"],
+        subchunk_write_order=case["subchunk_write_order"],
+    )
+    spec = _spec(case["shard_shape"], str(case["data"].dtype), write_empty_chunks=True)
+    blob = codec._encode_sync(CPUNDBuffer.from_numpy_array(case["data"]), spec)
+    assert blob is not None  # write_empty_chunks=True -> dense, never elided
+
+    general = codec._decode_sync(blob, spec)
+    indexer = BasicIndexer(
+        tuple(slice(0, s) for s in case["shard_shape"]),
+        shape=case["shard_shape"],
+        chunk_grid=ChunkGrid.from_sizes(case["shard_shape"], case["chunk_shape"]),
+    )
+    bulk = codec._decode_full_shard_bulk(blob, spec, indexer)
+    # the fast path must APPLY for this dense uncompressed configuration —
+    # a vacuous None would silently stop testing the equivalence
+    assert bulk is not None
+    np.testing.assert_array_equal(bulk.as_numpy_array(), general.as_numpy_array())
+    np.testing.assert_array_equal(general.as_numpy_array(), case["data"])
+
+
+def test_merge_complete_chunk_returns_view_and_write_does_not_mutate_source() -> None:
+    """The complete-chunk merge fast path returns a VIEW of the caller's value
+    (no copy — that is the perf win), and a multi-chunk write through either
+    pipeline leaves the user's source array untouched.
+
+    Pins both halves of the aliasing contract: a future "defensive copy"
+    refactor that silently reintroduces the per-chunk create/fill/copy breaks
+    the first assertion, and an in-place-mutating codec that corrupts the
+    user's array through the shared view breaks the second.
+    """
+    # the fast path must return a view aliasing `value`, not a copy
+    value_np = np.arange(30, dtype="uint16")
+    spec = _spec((10,), "uint16")
+    value = CPUNDBuffer.from_numpy_array(value_np)
+    merged = _merge_chunk_array(None, value, (slice(10, 20),), spec, (slice(0, 10),), True, ())
+    assert np.shares_memory(merged.as_numpy_array(), value_np), (
+        "complete-chunk merge no longer returns a view of the caller's value"
+    )
+
+    # end-to-end: the source array is byte-identical after a multi-chunk write
+    for pipeline_path in (
+        "zarr.core.codec_pipeline.FusedCodecPipeline",
+        "zarr.core.codec_pipeline.BatchedCodecPipeline",
+    ):
+        with zarr.config.set({"codec_pipeline.path": pipeline_path}):
+            arr = zarr.create_array(
+                store=MemoryStore(),
+                shape=(30,),
+                chunks=(10,),
+                dtype="uint16",
+                compressors=None,
+                fill_value=0,
+            )
+            source = np.arange(30, dtype="uint16")
+            snapshot = source.copy()
+            arr[:] = source
+            np.testing.assert_array_equal(source, snapshot, err_msg=pipeline_path)
+            np.testing.assert_array_equal(arr[:], snapshot, err_msg=pipeline_path)
+
+
+# ---------------------------------------------------------------------------
+# Scalar-broadcast write memoization: writing a scalar must produce the same
+# STORED BYTES as writing the equivalent broadcast array.
+# ---------------------------------------------------------------------------
+
+
+@st.composite
+def _scalar_cases(draw: st.DrawFn) -> dict[str, Any]:
+    n_chunks = draw(st.integers(2, 6))
+    chunk = draw(st.integers(2, 6))
+    shape = n_chunks * chunk
+    start = draw(st.integers(0, shape - 1))
+    stop = draw(st.integers(start + 1, shape))
+    return {
+        "shape": shape,
+        "chunk": chunk,
+        "sel": slice(start, stop),
+        "scalar": draw(st.integers(0, 255)),
+        "write_empty_chunks": draw(st.booleans()),
+        "sharded": draw(st.booleans()),
+    }
+
+
+@settings(max_examples=100, deadline=None)
+@given(case=_scalar_cases())
+def test_scalar_write_equals_broadcast_write(case: dict[str, Any]) -> None:
+    """arr[sel] = scalar and arr[sel] = full(sel_shape, scalar) must leave the
+    store byte-identical (pins the scalar-broadcast memoization in the sharded
+    partial-write path, incl. its empty-chunk normalization)."""
+
+    def build() -> tuple[MemoryStore, zarr.Array[Any]]:
+        store = MemoryStore()
+        arr = zarr.create_array(
+            store=store,
+            shape=(case["shape"],),
+            chunks=(case["chunk"],),
+            shards=(case["shape"],) if case["sharded"] else None,
+            dtype="uint8",
+            compressors=None,
+            fill_value=0,
+            config={"write_empty_chunks": case["write_empty_chunks"]},
+        )
+        return store, arr
+
+    store_a, arr_a = build()
+    arr_a[case["sel"]] = case["scalar"]
+
+    store_b, arr_b = build()
+    n = case["sel"].stop - case["sel"].start
+    arr_b[case["sel"]] = np.full(n, case["scalar"], dtype="uint8")
+
+    keys_a = {k: bytes(v.to_bytes()) for k, v in store_a._store_dict.items()}
+    keys_b = {k: bytes(v.to_bytes()) for k, v in store_b._store_dict.items()}
+    assert keys_a == keys_b
+
+
+# ---------------------------------------------------------------------------
+# get_ranges_sync coalescing: merged fetches must return exactly what
+# individual per-range gets return, for any gap/coalesce limits.
+# ---------------------------------------------------------------------------
+
+
+@st.composite
+def _range_cases(draw: st.DrawFn) -> dict[str, Any]:
+    blob_len = draw(st.integers(1, 200))
+    n = draw(st.integers(1, 8))
+    ranges: list[RangeByteRequest | OffsetByteRequest | SuffixByteRequest | None] = []
+    for _ in range(n):
+        kind = draw(st.sampled_from(["range", "offset", "suffix", "none"]))
+        if kind == "range":
+            start = draw(st.integers(0, blob_len - 1))
+            end = draw(st.integers(start + 1, blob_len))
+            ranges.append(RangeByteRequest(start, end))
+        elif kind == "offset":
+            ranges.append(OffsetByteRequest(draw(st.integers(0, blob_len - 1))))
+        elif kind == "suffix":
+            ranges.append(SuffixByteRequest(draw(st.integers(1, blob_len))))
+        else:
+            ranges.append(None)
+    return {
+        "blob": draw(st.binary(min_size=blob_len, max_size=blob_len)),
+        "ranges": ranges,
+        "max_gap": draw(st.integers(0, 64)),
+        "max_coalesced": draw(st.integers(1, 512)),
+    }
+
+
+@settings(max_examples=200, deadline=None)
+@given(case=_range_cases())
+def test_get_ranges_sync_equals_individual_gets(case: dict[str, Any]) -> None:
+    """Coalesced byte-range reads must return exactly what one get_sync per
+    range returns — for any gap/coalesce limits (the offset re-slicing math is
+    where a coalescing bug would corrupt data)."""
+    store = MemoryStore()
+    store._is_open = True
+    proto = default_buffer_prototype()
+    store._store_dict["k"] = CPUBuffer.from_bytes(case["blob"])
+
+    expected = [store.get_sync("k", prototype=proto, byte_range=r) for r in case["ranges"]]
+
+    got: dict[int, bytes | None] = {}
+    for idx, buf in store.get_ranges_sync(
+        "k",
+        case["ranges"],
+        prototype=proto,
+        max_gap_bytes=case["max_gap"],
+        max_coalesced_bytes=case["max_coalesced"],
+    ):
+        got[idx] = None if buf is None else bytes(buf.to_bytes())
+
+    for i, exp in enumerate(expected):
+        exp_bytes = None if exp is None else bytes(exp.to_bytes())
+        assert got.get(i) == exp_bytes, f"range {i} ({case['ranges'][i]!r}) mismatch"
diff --git a/tests/test_fused_pipeline.py b/tests/test_fused_pipeline.py
new file mode 100644
index 0000000000..db3a1b1f89
--- /dev/null
+++ b/tests/test_fused_pipeline.py
@@ -0,0 +1,784 @@
+"""Tests for FusedCodecPipeline -- the per-chunk-fused codec pipeline."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import numpy as np
+import pytest
+
+import zarr
+from zarr.codecs.bytes import BytesCodec
+from zarr.codecs.gzip import GzipCodec
+from zarr.codecs.transpose import TransposeCodec
+from zarr.codecs.zstd import ZstdCodec
+from zarr.core.codec_pipeline import FusedCodecPipeline
+from zarr.core.config import config as zarr_config
+from zarr.storage import MemoryStore, StorePath
+
+
+@pytest.mark.parametrize(
+    "codecs",
+    [
+        (BytesCodec(),),
+        (BytesCodec(), GzipCodec(level=1)),
+        (BytesCodec(), ZstdCodec(level=1)),
+        (TransposeCodec(order=(1, 0)), BytesCodec()),
+        (TransposeCodec(order=(1, 0)), BytesCodec(), ZstdCodec(level=1)),
+    ],
+    ids=["bytes-only", "gzip", "zstd", "transpose", "transpose+zstd"],
+)
+def test_construction(codecs: tuple[Any, ...]) -> None:
+    """FusedCodecPipeline can be constructed from valid codec combinations."""
+    pipeline = FusedCodecPipeline.from_codecs(codecs)
+    assert pipeline.codecs == codecs
+
+
+def test_evolve_from_array_spec() -> None:
+    """evolve_from_array_spec creates a sync transform."""
+    from zarr.core.array_spec import ArrayConfig, ArraySpec
+    from zarr.core.buffer import default_buffer_prototype
+    from zarr.core.dtype import get_data_type_from_native_dtype
+
+    pipeline = FusedCodecPipeline.from_codecs((BytesCodec(),))
+    assert pipeline.sync_transform is None
+
+    zdtype = get_data_type_from_native_dtype(np.dtype("float64"))
+    spec = ArraySpec(
+        shape=(100,),
+        dtype=zdtype,
+        fill_value=zdtype.cast_scalar(0),
+        config=ArrayConfig(order="C", write_empty_chunks=True),
+        prototype=default_buffer_prototype(),
+    )
+    evolved = pipeline.evolve_from_array_spec(spec)
+    assert evolved.sync_transform is not None
+
+
+# ---------------------------------------------------------------------------
+# Sync path tests
+#
+# These exercise FusedCodecPipeline's synchronous API (write_sync / read_sync /
+# sync_transform), which has no equivalent on BatchedCodecPipeline -- so they
+# cannot live in the pipeline-agnostic CodecPipelineTests suite. The async
+# roundtrip / fill-value behaviour is covered there (test_scenario) across both
+# pipelines and sync/async stores.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    ("dtype", "shape"),
+    [
+        ("float64", (100,)),
+        ("float32", (50,)),
+        ("int32", (200,)),
+        ("float64", (10, 10)),
+    ],
+    ids=["f64-1d", "f32-1d", "i32-1d", "f64-2d"],
+)
+def test_read_write_sync_roundtrip(dtype: str, shape: tuple[int, ...]) -> None:
+    """Data written via write_sync can be read back via read_sync."""
+    from zarr.core.array_spec import ArrayConfig, ArraySpec
+    from zarr.core.buffer import default_buffer_prototype
+    from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer
+    from zarr.core.dtype import get_data_type_from_native_dtype
+
+    store = MemoryStore()
+    zdtype = get_data_type_from_native_dtype(np.dtype(dtype))
+    spec = ArraySpec(
+        shape=shape,
+        dtype=zdtype,
+        fill_value=zdtype.cast_scalar(0),
+        config=ArrayConfig(order="C", write_empty_chunks=True),
+        prototype=default_buffer_prototype(),
+    )
+
+    pipeline = FusedCodecPipeline.from_codecs((BytesCodec(),))
+    pipeline = pipeline.evolve_from_array_spec(spec)
+
+    data = np.arange(int(np.prod(shape)), dtype=dtype).reshape(shape)
+    value = CPUNDBuffer.from_numpy_array(data)
+    chunk_selection = tuple(slice(0, s) for s in shape)
+    out_selection = chunk_selection
+    store_path = StorePath(store, "c/0")
+
+    # Write sync
+    pipeline.write_sync(
+        [(store_path, spec, chunk_selection, out_selection, True)],
+        value,
+    )
+
+    # Read sync
+    out = CPUNDBuffer.from_numpy_array(np.zeros(shape, dtype=dtype))
+    pipeline.read_sync(
+        [(store_path, spec, chunk_selection, out_selection, True)],
+        out,
+    )
+
+    np.testing.assert_array_equal(data, out.as_numpy_array())
+
+
+def test_read_sync_missing_chunk_fills() -> None:
+    """Sync read of a missing chunk fills with the fill value."""
+    from zarr.core.array_spec import ArrayConfig, ArraySpec
+    from zarr.core.buffer import default_buffer_prototype
+    from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer
+    from zarr.core.dtype import get_data_type_from_native_dtype
+
+    store = MemoryStore()
+    zdtype = get_data_type_from_native_dtype(np.dtype("float64"))
+    spec = ArraySpec(
+        shape=(10,),
+        dtype=zdtype,
+        fill_value=zdtype.cast_scalar(42.0),
+        config=ArrayConfig(order="C", write_empty_chunks=True),
+        prototype=default_buffer_prototype(),
+    )
+
+    pipeline = FusedCodecPipeline.from_codecs((BytesCodec(),))
+    pipeline = pipeline.evolve_from_array_spec(spec)
+
+    out = CPUNDBuffer.from_numpy_array(np.zeros(10, dtype="float64"))
+    store_path = StorePath(store, "c/0")
+    chunk_sel = (slice(0, 10),)
+
+    pipeline.read_sync(
+        [(store_path, spec, chunk_sel, chunk_sel, True)],
+        out,
+    )
+
+    np.testing.assert_array_equal(out.as_numpy_array(), np.full(10, 42.0))
+
+
+def test_sync_write_async_read_roundtrip() -> None:
+    """Data written via write_sync can be read back via async read."""
+    from zarr.core.array_spec import ArrayConfig, ArraySpec
+    from zarr.core.buffer import default_buffer_prototype
+    from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer
+    from zarr.core.dtype import get_data_type_from_native_dtype
+    from zarr.core.sync import sync
+
+    store = MemoryStore()
+    zdtype = get_data_type_from_native_dtype(np.dtype("float64"))
+    spec = ArraySpec(
+        shape=(100,),
+        dtype=zdtype,
+        fill_value=zdtype.cast_scalar(0),
+        config=ArrayConfig(order="C", write_empty_chunks=True),
+        prototype=default_buffer_prototype(),
+    )
+
+    pipeline = FusedCodecPipeline.from_codecs((BytesCodec(),))
+    pipeline = pipeline.evolve_from_array_spec(spec)
+
+    data = np.arange(100, dtype="float64")
+    value = CPUNDBuffer.from_numpy_array(data)
+    chunk_sel = (slice(0, 100),)
+    store_path = StorePath(store, "c/0")
+
+    # Write sync
+    pipeline.write_sync(
+        [(store_path, spec, chunk_sel, chunk_sel, True)],
+        value,
+    )
+
+    # Read async
+    out = CPUNDBuffer.from_numpy_array(np.zeros(100, dtype="float64"))
+    sync(
+        pipeline.read(
+            [(store_path, spec, chunk_sel, chunk_sel, True)],
+            out,
+        )
+    )
+
+
+# --- byte-range-write fast-path tests: disabled ---
+# The sharding codec's byte-range-write fast path (set_range_sync) was removed
+# from this PR pending a decision on the store interface; partial shard writes
+# now always take the full-shard-rewrite path. These tests are known-good and
+# kept commented out to restore once the store byte-range-write design lands.
+# def test_partial_shard_write_uses_set_range() -> None:
+#     """Partial shard writes with fixed-size codecs should use set_range_sync.
+#
+#     Only the FusedCodecPipeline uses byte-range writes for partial shard
+#     updates; skipped under other pipelines.
+#     """
+#     from unittest.mock import patch
+#
+#     store = zarr.storage.MemoryStore()
+#     # write_empty_chunks=True keeps a fixed-size dense layout, which is
+#     # required for the byte-range fast path (chunks never transition
+#     # present <-> absent).
+#     arr = zarr.create_array(
+#         store=store,
+#         shape=(100,),
+#         dtype="float64",
+#         chunks=(10,),
+#         shards=(100,),
+#         compressors=None,
+#         fill_value=0.0,
+#         config={"write_empty_chunks": True},
+#     )
+#     if not isinstance(arr._async_array.codec_pipeline, FusedCodecPipeline):
+#         pytest.skip("byte-range write optimization is specific to FusedCodecPipeline")
+#
+#     # Initial full write to create the shard blob
+#     arr[:] = np.arange(100, dtype="float64")
+#
+#     # Partial write — should use set_range_sync, not set_sync
+#     with patch.object(type(store), "set_range_sync", wraps=store.set_range_sync) as mock_set_range:
+#         arr[5] = 999.0
+#
+#     # set_range_sync should be called: once for the chunk data, once for the index
+#     assert mock_set_range.call_count >= 1, (
+#         "Expected set_range_sync to be called for partial shard write"
+#     )
+#
+#     # Verify correctness
+#     expected = np.arange(100, dtype="float64")
+#     expected[5] = 999.0
+#     np.testing.assert_array_equal(arr[:], expected)
+#
+#
+# def test_partial_shard_write_falls_back_for_compressed() -> None:
+#     """Partial shard writes with compressed inner codecs should NOT use set_range.
+#
+#     Only meaningful under FusedCodecPipeline (which can use byte-range writes
+#     for fixed-size inner codecs). Other pipelines never use set_range_sync,
+#     so the assertion is trivially true and the test is uninformative.
+#     """
+#     from unittest.mock import patch
+#
+#     store = zarr.storage.MemoryStore()
+#     arr = zarr.create_array(
+#         store=store,
+#         shape=(100,),
+#         dtype="float64",
+#         chunks=(10,),
+#         shards=(100,),
+#         compressors=GzipCodec(),
+#         fill_value=0.0,
+#     )
+#     if not isinstance(arr._async_array.codec_pipeline, FusedCodecPipeline):
+#         pytest.skip("byte-range write optimization is specific to FusedCodecPipeline")
+#     arr[:] = np.arange(100, dtype="float64")
+#
+#     with patch.object(type(store), "set_range_sync", wraps=store.set_range_sync) as mock_set_range:
+#         arr[5] = 999.0
+#
+#     # With compression, set_range_sync should NOT be used
+#     assert mock_set_range.call_count == 0, (
+#         "set_range_sync should not be used with compressed inner codecs"
+#     )
+#
+#     expected = np.arange(100, dtype="float64")
+#     expected[5] = 999.0
+#     np.testing.assert_array_equal(arr[:], expected)
+#
+#
+# def test_partial_shard_write_skips_set_range_when_write_empty_chunks_false() -> None:
+#     """The byte-range fast path must NOT fire under the default write_empty_chunks=False.
+#
+#     The fast path assumes a fixed, dense shard layout. With empty-chunk skipping
+#     (the default) a chunk can transition present<->absent, so an in-place
+#     byte-range overwrite would corrupt the layout. The complement of
+#     test_partial_shard_write_uses_set_range (which uses write_empty_chunks=True).
+#     """
+#     from unittest.mock import patch
+#
+#     store = zarr.storage.MemoryStore()
+#     arr = zarr.create_array(
+#         store=store,
+#         shape=(100,),
+#         dtype="float64",
+#         chunks=(10,),
+#         shards=(100,),
+#         compressors=None,
+#         fill_value=0.0,
+#         # default config: write_empty_chunks=False
+#     )
+#     if not isinstance(arr._async_array.codec_pipeline, FusedCodecPipeline):
+#         pytest.skip("byte-range write optimization is specific to FusedCodecPipeline")
+#     arr[:] = np.arange(100, dtype="float64")
+#
+#     with patch.object(type(store), "set_range_sync", wraps=store.set_range_sync) as mock_set_range:
+#         arr[5] = 999.0
+#
+#     assert mock_set_range.call_count == 0, (
+#         "byte-range fast path was taken with write_empty_chunks=False; "
+#         "this would produce a dense layout incompatible with empty-chunk skipping"
+#     )
+#
+#     expected = np.arange(100, dtype="float64")
+#     expected[5] = 999.0
+#     np.testing.assert_array_equal(arr[:], expected)
+#
+#
+# def test_partial_shard_write_handles_readonly_store_buffers(tmp_path: Path) -> None:
+#     """The byte-range path decodes the shard index from a store buffer and mutates
+#     it; LocalStore returns read-only buffers, so the path must copy before writing.
+#
+#     Without the copy, the partial write raises
+#     ``ValueError: assignment destination is read-only``. Fused-only because only
+#     the Fused byte-range path decodes+mutates a shard index in place.
+#     """
+#     store = zarr.storage.LocalStore(tmp_path / "data.zarr")
+#     arr = zarr.create_array(
+#         store=store,
+#         shape=(16,),
+#         chunks=(4,),
+#         shards=(8,),
+#         dtype="float64",
+#         compressors=None,
+#         fill_value=0.0,
+#         config={"write_empty_chunks": True},
+#     )
+#     if not isinstance(arr._async_array.codec_pipeline, FusedCodecPipeline):
+#         pytest.skip("byte-range write optimization is specific to FusedCodecPipeline")
+#     arr[:] = np.arange(16, dtype="float64")
+#     arr[2] = 42.0  # triggers the byte-range path against a read-only store buffer
+#     assert arr[2] == 42.0
+
+
+def test_chunk_transform_uses_runtime_prototype() -> None:
+    """ChunkTransform must pass each codec the prototype from the runtime chunk_spec,
+    not one captured at evolve time. Constructs ChunkTransform directly (a
+    Fused-internal data structure with no BatchedCodecPipeline equivalent).
+    """
+    from zarr.abc.codec import BytesBytesCodec
+    from zarr.core.array_spec import ArrayConfig, ArraySpec
+    from zarr.core.buffer import Buffer, BufferPrototype, default_buffer_prototype
+    from zarr.core.codec_pipeline import ChunkTransform
+    from zarr.core.dtype import get_data_type_from_native_dtype
+
+    class _PrototypeRecordingCodec(BytesBytesCodec):  # type: ignore[misc,unused-ignore]
+        """A no-op BB codec that records the prototype it was called with."""
+
+        is_fixed_size = True
+        seen_prototypes: list[object]
+
+        def __init__(self) -> None:
+            object.__setattr__(self, "seen_prototypes", [])
+
+        def to_dict(self) -> dict[str, Any]:
+            return {"name": "_prototype_recording", "configuration": {}}
+
+        @classmethod
+        def from_dict(cls, data: dict[str, Any]) -> _PrototypeRecordingCodec:
+            return cls()
+
+        def compute_encoded_size(self, input_byte_length: int, _spec: ArraySpec) -> int:
+            return input_byte_length
+
+        def _encode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer | None:
+            self.seen_prototypes.append(chunk_spec.prototype)
+            return chunk_bytes
+
+        def _decode_sync(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer:
+            self.seen_prototypes.append(chunk_spec.prototype)
+            return chunk_bytes
+
+        async def _encode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer | None:
+            return self._encode_sync(chunk_bytes, chunk_spec)
+
+        async def _decode_single(self, chunk_bytes: Buffer, chunk_spec: ArraySpec) -> Buffer:
+            return self._decode_sync(chunk_bytes, chunk_spec)
+
+    recording = _PrototypeRecordingCodec()
+    transform = ChunkTransform(codecs=(BytesCodec(), recording))
+
+    zdtype = get_data_type_from_native_dtype(np.dtype("float64"))
+
+    def _spec(prototype: BufferPrototype) -> ArraySpec:
+        return ArraySpec(
+            shape=(10,),
+            dtype=zdtype,
+            fill_value=zdtype.cast_scalar(0.0),
+            config=ArrayConfig(order="C", write_empty_chunks=False),
+            prototype=prototype,
+        )
+
+    proto_default = default_buffer_prototype()
+    # A distinct BufferPrototype instance with the same buffer/nd_buffer types --
+    # fails an identity check but works at runtime.
+    proto_other = BufferPrototype(buffer=proto_default.buffer, nd_buffer=proto_default.nd_buffer)
+    assert proto_other is not proto_default
+
+    arr = proto_default.nd_buffer.from_numpy_array(np.arange(10, dtype="float64"))
+    transform.encode_chunk(arr, _spec(proto_default))
+    transform.encode_chunk(arr, _spec(proto_other))
+
+    assert recording.seen_prototypes[0] is proto_default
+    assert recording.seen_prototypes[1] is proto_other, (
+        "ChunkTransform did not pass the runtime prototype to the codec"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Thread-pool (max_workers > 1) tests
+#
+# The pool dispatch in read_sync/write_sync is Fused-only and off by default
+# (codec_pipeline.max_workers defaults to 1 == sequential). These tests opt in
+# and exercise the pool path end-to-end, exception propagation from workers,
+# and concurrent decode through the shared ChunkTransform.
+# ---------------------------------------------------------------------------
+
+_FUSED_POOL_CONFIG = {
+    "codec_pipeline.path": "zarr.core.codec_pipeline.FusedCodecPipeline",
+    "codec_pipeline.max_workers": 4,
+}
+
+
+def test_read_write_with_thread_pool() -> None:
+    """With max_workers > 1, multi-chunk reads and writes dispatch through the
+    thread pool (pool.map in read_sync/write_sync) and produce the same results
+    as sequential execution.
+
+    The `_get_pool` spy pins that the pool branch actually fires: without it,
+    a config-resolution regression (renamed key, `_resolve_max_workers`
+    returning 1) would silently degrade all the pool tests into re-testing the
+    sequential branch while staying green.
+    """
+    from unittest.mock import patch
+
+    import zarr.core.codec_pipeline as cp_mod
+
+    with zarr_config.set(_FUSED_POOL_CONFIG):
+        assert cp_mod._resolve_max_workers() == 4, (
+            "codec_pipeline.max_workers config did not reach _resolve_max_workers"
+        )
+        store = MemoryStore()
+        arr = zarr.create_array(
+            store=store,
+            shape=(100,),
+            chunks=(10,),
+            dtype="float64",
+            compressors=None,
+            fill_value=0.0,
+        )
+        assert isinstance(arr._async_array.codec_pipeline, FusedCodecPipeline)
+        data = np.arange(100, dtype="float64")
+        with patch.object(cp_mod, "_get_pool", wraps=cp_mod._get_pool) as pool_spy:
+            arr[:] = data  # 10 chunks -> pool dispatch in write_sync
+            assert pool_spy.call_count >= 1, "multi-chunk write did not take the pool branch"
+            writes = pool_spy.call_count
+            np.testing.assert_array_equal(arr[:], data)  # pool dispatch in read_sync
+            assert pool_spy.call_count > writes, "multi-chunk read did not take the pool branch"
+        arr[5:25] = 7.0  # partial write: merge path through the pool
+        data[5:25] = 7.0
+        np.testing.assert_array_equal(arr[:], data)
+
+
+def test_thread_pool_write_worker_exception_propagates() -> None:
+    """A store error raised inside a pool worker during write_sync surfaces to
+    the caller (write_sync consumes pool.map, so worker exceptions re-raise)."""
+    from unittest.mock import patch
+
+    with zarr_config.set(_FUSED_POOL_CONFIG):
+        store = MemoryStore()
+        arr = zarr.create_array(
+            store=store,
+            shape=(100,),
+            chunks=(10,),
+            dtype="float64",
+            compressors=None,
+            fill_value=0.0,
+        )
+        with (
+            patch.object(store, "set_sync", side_effect=RuntimeError("simulated store error")),
+            pytest.raises(RuntimeError, match="simulated store error"),
+        ):
+            arr[:] = np.arange(100, dtype="float64")
+
+
+def test_thread_pool_read_worker_exception_propagates() -> None:
+    """A store error raised inside a pool worker during read_sync surfaces to
+    the caller (read_sync consumes pool.map into a tuple)."""
+    from unittest.mock import patch
+
+    with zarr_config.set(_FUSED_POOL_CONFIG):
+        store = MemoryStore()
+        arr = zarr.create_array(
+            store=store,
+            shape=(100,),
+            chunks=(10,),
+            dtype="float64",
+            compressors=None,
+            fill_value=0.0,
+        )
+        arr[:] = np.arange(100, dtype="float64")
+        with (
+            patch.object(store, "get_sync", side_effect=RuntimeError("simulated store error")),
+            pytest.raises(RuntimeError, match="simulated store error"),
+        ):
+            arr[:]
+
+
+def test_concurrent_reads_shared_transform_with_pool() -> None:
+    """Concurrent decode through the shared ChunkTransform produces correct data.
+
+    The transform's `_resolve_specs` cache is shared mutable state. With no
+    array->array codecs the cache is bypassed entirely, so this uses a transpose
+    filter to force cache traffic, max_workers=4 so pool workers decode chunks
+    concurrently, and an outer thread pool so multiple reads are in flight at
+    once. Each round RE-OPENS the array so the shared transform starts with a
+    cold spec cache and the concurrent readers race the non-atomic first fill —
+    reading through a single pre-warmed handle would only ever exercise cache
+    hits. This pins correctness under concurrency (it cannot prove the absence
+    of a race, but a torn cache would corrupt results here).
+    """
+    from concurrent.futures import ThreadPoolExecutor
+
+    with zarr_config.set(_FUSED_POOL_CONFIG):
+        store = MemoryStore()
+        arr = zarr.create_array(
+            store=store,
+            shape=(40, 40),
+            chunks=(5, 5),
+            dtype="int32",
+            filters=[TransposeCodec(order=(1, 0))],
+            serializer=BytesCodec(),
+            compressors=None,
+            fill_value=-1,
+        )
+        data = np.arange(1600, dtype="int32").reshape(40, 40)
+        arr[:] = data
+
+        for _ in range(5):  # several rounds, each racing a cold cache
+            fresh = zarr.open_array(store=store, mode="r")
+
+            def read_row_block(i: int, handle: zarr.Array[Any] = fresh) -> np.ndarray:
+                return np.asarray(handle[i * 4 : (i + 1) * 4, :])
+
+            with ThreadPoolExecutor(max_workers=8) as ex:
+                futures = {ex.submit(read_row_block, i): i for i in range(10)}
+                for fut, i in futures.items():
+                    np.testing.assert_array_equal(fut.result(), data[i * 4 : (i + 1) * 4, :])
+
+
+def test_sharded_fallback_inner_chunks_avoid_async_transform() -> None:
+    """Inner chunks of a shard on a NON-sync store decode through the sync
+    ChunkTransform, not per-chunk AsyncChunkTransform coroutines.
+
+    The sharding byte getters are in-memory dict wrappers; they implement
+    SyncByteGetter/SyncByteSetter, and the nested inner pipeline is evolved
+    (so its sync transform exists), letting the nested read/write take the
+    sync fast path. Without this, every inner chunk pays a coroutine for a
+    dict lookup plus an async per-chunk transform — measured at 1.5x (raw) to
+    3.6x (gzip) of sharded fallback read time.
+    """
+    from unittest.mock import patch
+
+    from zarr.core.codec_pipeline import AsyncChunkTransform
+    from zarr.testing.store import LatencyStore
+
+    calls = {"decode": 0, "encode": 0}
+    orig_decode = AsyncChunkTransform.decode_chunk
+    orig_encode = AsyncChunkTransform.encode_chunk
+
+    async def spy_decode(self: Any, *args: Any, **kwargs: Any) -> Any:
+        calls["decode"] += 1
+        return await orig_decode(self, *args, **kwargs)
+
+    async def spy_encode(self: Any, *args: Any, **kwargs: Any) -> Any:
+        calls["encode"] += 1
+        return await orig_encode(self, *args, **kwargs)
+
+    # LatencyStore is not sync-capable -> the OUTER pipeline takes the async
+    # fallback; the INNER chunks go over the sharding byte getters.
+    store = LatencyStore(MemoryStore(), get_latency=0.0, set_latency=0.0)
+    arr = zarr.create_array(
+        store=store,
+        shape=(100,),
+        chunks=(10,),
+        shards=(50,),
+        dtype="uint8",
+        compressors=None,
+        fill_value=0,
+    )
+    if not isinstance(arr._async_array.codec_pipeline, FusedCodecPipeline):
+        pytest.skip("sync fast path for inner chunks is specific to FusedCodecPipeline")
+
+    data = np.arange(100, dtype="uint8")
+    sync_calls = {"read_sync": 0}
+    orig_read_sync = FusedCodecPipeline.read_sync
+
+    def spy_read_sync(self: Any, *args: Any, **kwargs: Any) -> Any:
+        sync_calls["read_sync"] += 1
+        return orig_read_sync(self, *args, **kwargs)
+
+    with (
+        patch.object(AsyncChunkTransform, "decode_chunk", spy_decode),
+        patch.object(AsyncChunkTransform, "encode_chunk", spy_encode),
+        patch.object(FusedCodecPipeline, "read_sync", spy_read_sync),
+    ):
+        arr[:] = data
+        out = np.asarray(arr[:])
+
+    np.testing.assert_array_equal(out, data)
+    assert calls == {"decode": 0, "encode": 0}, (
+        f"inner chunks went through per-chunk AsyncChunkTransform coroutines: {calls}"
+    )
+    # The outer store is not sync-capable, so any read_sync calls are the
+    # NESTED pipeline taking the sync fast path over the sharding byte getters
+    # (the SyncByteGetter gate). Without the gate, inner chunks go through
+    # concurrent_map with one coroutine per chunk. (Writes don't appear here:
+    # the fallback write encodes whole shards through the outer sync transform
+    # -> ShardingCodec._encode_sync, never touching the nested byte setters.)
+    assert sync_calls["read_sync"] >= 1, "nested read did not take the sync fast path"
+
+
+def test_write_over_sync_byte_setter_takes_sync_path() -> None:
+    """`FusedCodecPipeline.write` routes a non-StorePath `SyncByteSetter` (the
+    sharding codec's `_ShardingByteSetter`) through `write_sync`.
+
+    This is the write-side twin of the SyncByteGetter gate: the read test
+    above cannot guard it because fallback whole-array writes encode shards
+    via `_encode_sync` and never touch the nested byte setters. The nested
+    `write` over `_ShardingByteSetter` is reached from the async shard encode
+    paths (`_encode_single`/`_encode_partial_single`), so pin the gate
+    directly: without it, this write degrades to the async fallback (one
+    coroutine per inner chunk for an in-memory dict store).
+    """
+    import asyncio
+    from unittest.mock import patch
+
+    from zarr.codecs.sharding import _ShardingByteSetter
+    from zarr.core.array_spec import ArrayConfig, ArraySpec
+    from zarr.core.buffer import default_buffer_prototype
+    from zarr.core.dtype import get_data_type_from_native_dtype
+
+    zdtype = get_data_type_from_native_dtype(np.dtype("uint8"))
+    spec = ArraySpec(
+        shape=(10,),
+        dtype=zdtype,
+        fill_value=zdtype.cast_scalar(0),
+        config=ArrayConfig(order="C", write_empty_chunks=True),
+        prototype=default_buffer_prototype(),
+    )
+    pipeline = FusedCodecPipeline.from_codecs([BytesCodec()]).evolve_from_array_spec(spec)
+    assert pipeline.sync_transform is not None
+
+    shard_dict: dict[tuple[int, ...], Any] = {}
+    setter = _ShardingByteSetter(shard_dict, (0,))
+    value = default_buffer_prototype().nd_buffer.from_numpy_array(np.arange(10, dtype="uint8"))
+
+    sync_calls = {"write_sync": 0}
+    orig_write_sync = FusedCodecPipeline.write_sync
+
+    def spy_write_sync(self: Any, *args: Any, **kwargs: Any) -> Any:
+        sync_calls["write_sync"] += 1
+        return orig_write_sync(self, *args, **kwargs)
+
+    sel = (slice(0, 10),)
+    with patch.object(FusedCodecPipeline, "write_sync", spy_write_sync):
+        asyncio.run(pipeline.write([(setter, spec, sel, sel, True)], value))
+
+    assert sync_calls["write_sync"] >= 1, (
+        "write over a SyncByteSetter did not take the sync fast path"
+    )
+    written = shard_dict[(0,)]
+    np.testing.assert_array_equal(
+        np.frombuffer(written.to_bytes(), dtype="uint8"), np.arange(10, dtype="uint8")
+    )
+
+
+# ---------------------------------------------------------------------------
+# AsyncChunkTransform: the async per-chunk codec chain used on the async
+# fallback path. It is the async mirror of ChunkTransform, so it must produce
+# identical bytes/arrays. The default (Fused, sync-store) path never uses it;
+# these tests drive it directly over multi-codec chains so the aa/bb loops and
+# the all-fill drop branch are exercised.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "codecs",
+    [
+        (BytesCodec(),),
+        (BytesCodec(), GzipCodec(level=1)),
+        (TransposeCodec(order=(1, 0)), BytesCodec()),
+        (TransposeCodec(order=(1, 0)), BytesCodec(), ZstdCodec(level=1)),
+    ],
+    ids=["bytes-only", "bb", "aa", "aa+ab+bb"],
+)
+def test_async_chunk_transform_matches_sync(codecs: tuple[Any, ...]) -> None:
+    """`AsyncChunkTransform.decode_chunk`/`encode_chunk` must round-trip and
+    produce exactly what the synchronous `ChunkTransform` produces, across
+    array->array, array->bytes, and bytes->bytes codec combinations.
+
+    This is the async mirror of the codecs the default pipeline runs
+    synchronously; a divergence here corrupts data only on the async fallback
+    path (remote stores), which no end-to-end test of the default pipeline
+    touches.
+    """
+    import asyncio
+
+    from zarr.core.array_spec import ArrayConfig, ArraySpec
+    from zarr.core.buffer import default_buffer_prototype
+    from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer
+    from zarr.core.codec_pipeline import AsyncChunkTransform, ChunkTransform, evolve_codecs
+    from zarr.core.dtype import get_data_type_from_native_dtype
+
+    shape = (4, 4)
+    zdtype = get_data_type_from_native_dtype(np.dtype("int32"))
+    spec = ArraySpec(
+        shape=shape,
+        dtype=zdtype,
+        fill_value=zdtype.cast_scalar(0),
+        config=ArrayConfig(order="C", write_empty_chunks=True),
+        prototype=default_buffer_prototype(),
+    )
+    evolved = evolve_codecs(codecs, spec)
+    sync_t = ChunkTransform(codecs=evolved)
+    async_t = AsyncChunkTransform(codecs=evolved)
+
+    data = np.arange(16, dtype="int32").reshape(shape)
+    value = CPUNDBuffer.from_numpy_array(data)
+
+    sync_bytes = sync_t.encode_chunk(value, spec)
+    async_bytes = asyncio.run(async_t.encode_chunk(value, spec))
+    assert sync_bytes is not None
+    assert async_bytes is not None
+    np.testing.assert_array_equal(async_bytes.to_bytes(), sync_bytes.to_bytes())
+
+    sync_arr = sync_t.decode_chunk(async_bytes, spec)
+    async_arr = asyncio.run(async_t.decode_chunk(async_bytes, spec))
+    np.testing.assert_array_equal(async_arr.as_numpy_array(), sync_arr.as_numpy_array())
+    np.testing.assert_array_equal(async_arr.as_numpy_array(), data)
+
+
+def test_async_decode_encode_passes_through_none_chunks() -> None:
+    """`FusedCodecPipeline.decode`/`encode` (the async batch entry points used
+    on the fallback path) map a None chunk to None and leave real chunks
+    untouched — pins the None-passthrough branch the default sync path skips."""
+    import asyncio
+
+    from zarr.core.array_spec import ArrayConfig, ArraySpec
+    from zarr.core.buffer import default_buffer_prototype
+    from zarr.core.buffer.cpu import NDBuffer as CPUNDBuffer
+    from zarr.core.dtype import get_data_type_from_native_dtype
+
+    zdtype = get_data_type_from_native_dtype(np.dtype("int32"))
+    spec = ArraySpec(
+        shape=(4,),
+        dtype=zdtype,
+        fill_value=zdtype.cast_scalar(0),
+        config=ArrayConfig(order="C", write_empty_chunks=True),
+        prototype=default_buffer_prototype(),
+    )
+    pipeline = FusedCodecPipeline.from_codecs([BytesCodec()]).evolve_from_array_spec(spec)
+
+    data = np.arange(4, dtype="int32")
+    value = CPUNDBuffer.from_numpy_array(data)
+
+    # encode a real chunk and a None chunk together
+    encoded = list(asyncio.run(pipeline.encode([(value, spec), (None, spec)])))
+    assert encoded[1] is None
+    assert encoded[0] is not None
+
+    # decode the real chunk and a None chunk together
+    decoded = list(asyncio.run(pipeline.decode([(encoded[0], spec), (None, spec)])))
+    assert decoded[1] is None
+    assert decoded[0] is not None
+    np.testing.assert_array_equal(decoded[0].as_numpy_array(), data)
diff --git a/tests/test_pipeline_parity.py b/tests/test_pipeline_parity.py
new file mode 100644
index 0000000000..49bf25af69
--- /dev/null
+++ b/tests/test_pipeline_parity.py
@@ -0,0 +1,415 @@
+"""Pipeline parity test — exhaustive matrix of read/write scenarios.
+
+For every cell of the matrix (codec config x layout x operation
+sequence x runtime config), assert that ``FusedCodecPipeline`` and
+``BatchedCodecPipeline`` produce semantically identical results:
+
+  * Same returned array contents on read.
+  * Same set of store keys after writes (catches divergent empty-shard
+    handling: one pipeline deletes, the other writes an empty blob).
+  * Reading each pipeline's store contents through the *other* pipeline
+    yields the same array (catches "wrote a layout that only one
+    pipeline can read" bugs).
+
+Pipeline-divergence bugs (e.g. one pipeline writes a dense shard
+layout while the other writes a compact layout) fail this test
+loudly with a clear diff, instead of waiting for a downstream
+test to trip over the symptom.
+
+Byte-for-byte equality of store contents is intentionally NOT
+checked: codecs like gzip embed the wall-clock timestamp in their
+output, so two compressions of the same data done at different
+seconds produce different bytes despite being semantically
+identical.
+
+The matrix axes are:
+
+  * codec chain — bytes-only, gzip, with/without sharding
+  * layout — chunk_shape, shard_shape (None for no sharding)
+  * write sequence — full overwrite, partial in middle, scalar to one
+    cell, multiple overlapping writes, sequence ending in fill values
+  * runtime config — write_empty_chunks True/False
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+import pytest
+
+import zarr
+from zarr.codecs.bytes import BytesCodec
+from zarr.codecs.crc32c_ import Crc32cCodec
+from zarr.codecs.gzip import GzipCodec
+from zarr.codecs.sharding import SUBCHUNK_WRITE_ORDER, ShardingCodec, SubchunkWriteOrder
+from zarr.core.config import config as zarr_config
+from zarr.storage import MemoryStore
+
+if TYPE_CHECKING:
+    from collections.abc import Callable, Iterator
+
+
+# ---------------------------------------------------------------------------
+# Reference helpers
+# ---------------------------------------------------------------------------
+
+
+def _store_snapshot(store: MemoryStore) -> dict[str, bytes]:
+    """Return {key: bytes} for every entry in the store."""
+    return {k: bytes(v.to_bytes()) for k, v in store._store_dict.items()}
+
+
+# ---------------------------------------------------------------------------
+# Matrix definitions
+# ---------------------------------------------------------------------------
+
+
+# Each codec config is (filters, serializer, compressors). We only vary the
+# pieces that actually affect the pipeline. compressors=None means a
+# fixed-size chain (the byte-range fast path is eligible when sharded).
+CodecConfig = dict[str, Any]
+
+CODEC_CONFIGS: list[tuple[str, CodecConfig]] = [
+    ("bytes-only", {"compressors": None}),
+    ("gzip", {"compressors": GzipCodec(level=1)}),
+    # Big-endian serializer: the on-disk byte order is carried by the BytesCodec,
+    # not the dtype. Guards the bulk whole-shard decode against ignoring endian
+    # (it would otherwise reinterpret big-endian bytes as native — silent
+    # corruption). dtype is int32 so endianness is observable.
+    (
+        "bytes-big-endian",
+        {"compressors": None, "serializer": BytesCodec(endian="big"), "dtype": "int32"},
+    ),
+    # crc32c as a bytes->bytes codec after the serializer: the bulk fast path
+    # must NOT silently drop checksum verification (it falls through to the
+    # per-chunk path). Parity still requires identical bytes + contents across
+    # pipelines. (crc32c is a BytesBytesCodec, so it goes in `compressors`.)
+    (
+        "bytes-crc32c",
+        {"compressors": [Crc32cCodec()], "serializer": BytesCodec(), "dtype": "int32"},
+    ),
+]
+
+
+# (id, kwargs) — chunks/shards layout. kwargs are passed to create_array.
+LayoutConfig = dict[str, Any]
+
+LAYOUT_CONFIGS: list[tuple[str, LayoutConfig]] = [
+    ("1d-unsharded", {"shape": (100,), "chunks": (10,), "shards": None}),
+    ("1d-1chunk-per-shard", {"shape": (100,), "chunks": (10,), "shards": (10,)}),
+    ("1d-multi-chunk-per-shard", {"shape": (100,), "chunks": (10,), "shards": (50,)}),
+    ("2d-unsharded", {"shape": (20, 20), "chunks": (5, 5), "shards": None}),
+    ("2d-sharded", {"shape": (20, 20), "chunks": (5, 5), "shards": (10, 10)}),
+    # Nested sharding: outer chunk (10,10) sharded into inner chunks (5,5).
+    # Restricted to bytes-only codec because combining an outer ShardingCodec
+    # with a compressor (gzip) triggers a ZarrUserWarning and results in a
+    # checksum mismatch inside the inner shard index — a known limitation, not
+    # a pipeline-parity bug.  The bytes-only path still exercises the full
+    # two-level shard encoding/decoding in both pipelines.
+    (
+        "2d-nested-sharded",
+        {
+            "shape": (20, 20),
+            "chunks": (10, 10),
+            "shards": None,
+            "serializer": ShardingCodec(
+                chunk_shape=(10, 10),
+                codecs=[ShardingCodec(chunk_shape=(5, 5))],
+            ),
+            # Only run with the bytes-only codec config; gzip is incompatible
+            # with nested sharding (see comment above).
+            "_codec_ids": {"bytes-only"},
+        },
+    ),
+]
+
+
+WriteOp = tuple[Any, Any]  # (selection, value)
+WriteSequence = tuple[str, list[WriteOp]]
+
+
+def _full_overwrite(shape: tuple[int, ...]) -> list[WriteOp]:
+    return [((slice(None),) * len(shape), np.arange(int(np.prod(shape))).reshape(shape) + 1)]
+
+
+def _partial_middle(shape: tuple[int, ...]) -> list[WriteOp]:
+    if len(shape) == 1:
+        n = shape[0]
+        return [((slice(n // 4, 3 * n // 4),), 7)]
+    # 2D: write a centered block
+    rs = slice(shape[0] // 4, 3 * shape[0] // 4)
+    cs = slice(shape[1] // 4, 3 * shape[1] // 4)
+    return [((rs, cs), 7)]
+
+
+def _scalar_one_cell(shape: tuple[int, ...]) -> list[WriteOp]:
+    if len(shape) == 1:
+        return [((shape[0] // 2,), 99)]
+    return [((shape[0] // 2, shape[1] // 2), 99)]
+
+
+def _overlapping(shape: tuple[int, ...]) -> list[WriteOp]:
+    if len(shape) == 1:
+        n = shape[0]
+        return [
+            ((slice(0, n // 2),), 1),
+            ((slice(n // 4, 3 * n // 4),), 2),
+            ((slice(n // 2, n),), 3),
+        ]
+    rs1, cs1 = slice(0, shape[0] // 2), slice(0, shape[1] // 2)
+    rs2, cs2 = slice(shape[0] // 4, 3 * shape[0] // 4), slice(shape[1] // 4, 3 * shape[1] // 4)
+    return [((rs1, cs1), 1), ((rs2, cs2), 2)]
+
+
+def _ends_in_fill(shape: tuple[int, ...]) -> list[WriteOp]:
+    """Write something then overwrite it with fill — exercises empty-chunk handling."""
+    full = (slice(None),) * len(shape)
+    return [(full, 5), (full, 0)]
+
+
+def _ends_in_partial_fill(shape: tuple[int, ...]) -> list[WriteOp]:
+    """Write data, then overwrite half with fill — some chunks become empty."""
+    full: tuple[slice, ...]
+    half: tuple[slice, ...]
+    if len(shape) == 1:
+        full = (slice(None),)
+        half = (slice(0, shape[0] // 2),)
+    else:
+        full = (slice(None), slice(None))
+        half = (slice(0, shape[0] // 2), slice(None))
+    return [(full, 5), (half, 0)]
+
+
+SEQUENCES: list[tuple[str, Callable[[tuple[int, ...]], list[WriteOp]]]] = [
+    ("full-overwrite", _full_overwrite),
+    ("partial-middle", _partial_middle),
+    ("scalar-one-cell", _scalar_one_cell),
+    ("overlapping", _overlapping),
+    ("ends-in-fill", _ends_in_fill),
+    ("ends-in-partial-fill", _ends_in_partial_fill),
+]
+
+
+WRITE_EMPTY_CHUNKS = [False, True]
+
+
+# ---------------------------------------------------------------------------
+# Matrix iteration (pruned)
+# ---------------------------------------------------------------------------
+
+
+def _matrix() -> Iterator[Any]:
+    for codec_id, codec_kwargs in CODEC_CONFIGS:
+        for layout_id, layout in LAYOUT_CONFIGS:
+            allowed = layout.get("_codec_ids")
+            if allowed is not None and codec_id not in allowed:
+                continue
+            for seq_id, seq_fn in SEQUENCES:
+                for wec in WRITE_EMPTY_CHUNKS:
+                    yield pytest.param(
+                        codec_kwargs,
+                        layout,
+                        seq_fn,
+                        wec,
+                        id=f"{layout_id}-{codec_id}-{seq_id}-wec{wec}",
+                    )
+
+
+# ---------------------------------------------------------------------------
+# The parity test
+# ---------------------------------------------------------------------------
+
+
+def _write_under_pipeline(
+    pipeline_path: str,
+    codec_kwargs: CodecConfig,
+    layout: LayoutConfig,
+    sequence: list[WriteOp],
+    write_empty_chunks: bool,
+) -> tuple[MemoryStore, Any]:
+    """Apply a sequence of writes via the chosen pipeline.
+
+    Returns (store with the written data, final array contents read back).
+    """
+    # Strip private metadata keys (e.g. "_codec_ids") before passing to create_array.
+    array_layout = {k: v for k, v in layout.items() if not k.startswith("_")}
+    # dtype defaults to float64 but a codec config may override it (e.g. an
+    # endian-sensitive int dtype). Merge so the override wins without a dup kwarg.
+    create_kwargs = {"dtype": "float64", **array_layout, **codec_kwargs}
+    store = MemoryStore()
+    with zarr_config.set({"codec_pipeline.path": pipeline_path}):
+        arr = zarr.create_array(
+            store=store,
+            fill_value=0,
+            config={"write_empty_chunks": write_empty_chunks},
+            **create_kwargs,
+        )
+        for sel, val in sequence:
+            arr[sel] = val
+        contents = arr[...]
+    return store, contents
+
+
+def _read_under_pipeline(pipeline_path: str, store: MemoryStore) -> Any:
+    """Re-open an existing store under the chosen pipeline and read it whole."""
+    with zarr_config.set({"codec_pipeline.path": pipeline_path}):
+        arr = zarr.open_array(store=store, mode="r")
+        return arr[...]
+
+
+_BATCHED = "zarr.core.codec_pipeline.BatchedCodecPipeline"
+_FUSED = "zarr.core.codec_pipeline.FusedCodecPipeline"
+
+
+@pytest.mark.parametrize(
+    ("codec_kwargs", "layout", "sequence_fn", "write_empty_chunks"),
+    list(_matrix()),
+)
+def test_pipeline_parity(
+    codec_kwargs: CodecConfig,
+    layout: LayoutConfig,
+    sequence_fn: Callable[[tuple[int, ...]], list[WriteOp]],
+    write_empty_chunks: bool,
+) -> None:
+    """FusedCodecPipeline must be semantically identical to BatchedCodecPipeline.
+
+    Three checks, in order of decreasing diagnostic value:
+
+      1. Both pipelines return the same array contents after the same
+         write sequence (catches semantic correctness bugs).
+      2. Both pipelines produce the same set of store keys (catches
+         empty-shard divergence: one deletes, the other doesn't).
+      3. Each pipeline can correctly read the *other* pipeline's
+         output (catches layout-divergence bugs that would prevent
+         interop, e.g. dense vs compact shard layouts).
+
+    Byte-for-byte store equality is intentionally not checked: codecs
+    like gzip embed wall-clock timestamps that vary between runs.
+    """
+    sequence = sequence_fn(layout["shape"])
+
+    batched_store, batched_arr = _write_under_pipeline(
+        _BATCHED, codec_kwargs, layout, sequence, write_empty_chunks
+    )
+    sync_store, sync_arr = _write_under_pipeline(
+        _FUSED, codec_kwargs, layout, sequence, write_empty_chunks
+    )
+
+    # 1. Array contents must agree.
+    np.testing.assert_array_equal(
+        sync_arr,
+        batched_arr,
+        err_msg="FusedCodecPipeline returned different array contents than BatchedCodecPipeline",
+    )
+
+    # 2. Store key sets must agree.
+    batched_keys = set(batched_store._store_dict) - {"zarr.json"}
+    sync_keys = set(sync_store._store_dict) - {"zarr.json"}
+    assert sync_keys == batched_keys, (
+        f"Pipelines disagree on which store keys exist.\n"
+        f"  only in batched: {sorted(batched_keys - sync_keys)}\n"
+        f"  only in sync:    {sorted(sync_keys - batched_keys)}"
+    )
+
+    # 3. Cross-read: each pipeline must correctly read the other's output.
+    sync_reads_batched = _read_under_pipeline(_FUSED, batched_store)
+    batched_reads_sync = _read_under_pipeline(_BATCHED, sync_store)
+    np.testing.assert_array_equal(
+        sync_reads_batched,
+        batched_arr,
+        err_msg="FusedCodecPipeline could not correctly read BatchedCodecPipeline's output",
+    )
+    np.testing.assert_array_equal(
+        batched_reads_sync,
+        sync_arr,
+        err_msg="BatchedCodecPipeline could not correctly read FusedCodecPipeline's output",
+    )
+
+
+# ---------------------------------------------------------------------------
+# Partial-read parity across subchunk write orders
+# ---------------------------------------------------------------------------
+#
+# Note: general partial-read coverage (scalar single-element and strided reads
+# from sharded arrays, which hit the sharding codec's partial-decode path) lives
+# in tests/test_codec_pipeline_suite.py as Scenarios. Those run each pipeline
+# against a numpy reference -- strictly stronger than checking the two pipelines
+# only against each other, and they cover both the sync (_decode_partial_sync)
+# and async (_decode_partial_single) partial-decode variants. What remains here
+# is the cross-pipeline byte-identical-layout check, which the per-pipeline
+# suite structurally cannot express.
+
+
+@pytest.mark.parametrize("subchunk_write_order", SUBCHUNK_WRITE_ORDER)
+@pytest.mark.parametrize("index_location", ["start", "end"])
+def test_pipeline_parity_subchunk_write_order(
+    subchunk_write_order: SubchunkWriteOrder, index_location: str
+) -> None:
+    """Both pipelines must agree across every subchunk_write_order, including a
+    PARTIAL write into an already-dense fixed-size shard.
+
+    This is the regression net for the byte-range write fast path, which derives
+    each chunk's physical slot from its rank in subchunk_write_order. A wrong
+    (e.g. hardcoded morton) assumption corrupts non-default orders silently, so
+    we assert both identical contents AND identical stored bytes across pipelines.
+    write_empty_chunks=True keeps every slot present, making the shard dense and
+    the byte-range write path eligible.
+    """
+    # 2D, fixed-size (no compression). The shard (array `chunks`) must hold
+    # MULTIPLE inner chunks, and be non-square, so morton / lexicographic /
+    # colexicographic produce physically DIFFERENT layouts — with one inner
+    # chunk per shard all orders coincide and a wrong-order bug is invisible.
+    # inner chunk = (2, 2); shard = (6, 4) -> a 3x2 grid of inner chunks.
+    shape, shard_shape, inner_chunk = (12, 8), (6, 4), (2, 2)
+    serializer = ShardingCodec(
+        chunk_shape=inner_chunk,
+        codecs=[BytesCodec()],
+        index_location=index_location,
+        subchunk_write_order=subchunk_write_order,
+    )
+    ref = np.arange(int(np.prod(shape)), dtype="int32").reshape(shape)
+
+    def run(pipeline_path: str) -> tuple[dict[str, bytes], Any]:
+        store = MemoryStore()
+        with zarr_config.set({"codec_pipeline.path": pipeline_path}):
+            arr = zarr.create_array(
+                store=store,
+                shape=shape,
+                chunks=shard_shape,  # array "chunks" == shard size for a ShardingCodec serializer
+                dtype="int32",
+                fill_value=-1,
+                serializer=serializer,
+                compressors=None,
+                config={"write_empty_chunks": True},
+            )
+            arr[:] = ref  # dense full write
+            arr[3:9, 1:6] = 777  # partial write INTO the dense shard
+            contents = arr[...]
+        return _store_snapshot(store), contents
+
+    batched_bytes, batched_contents = run(_BATCHED)
+    sync_bytes, sync_contents = run(_FUSED)
+
+    # Contents must always match across pipelines and equal the reference —
+    # this catches a wrong-order byte-range write (it corrupts the data).
+    expected = ref.copy()
+    expected[3:9, 1:6] = 777
+    np.testing.assert_array_equal(batched_contents, expected)
+    np.testing.assert_array_equal(
+        sync_contents,
+        batched_contents,
+        err_msg=f"pipeline contents diverged for subchunk_write_order={subchunk_write_order!r}",
+    )
+    # The two pipelines must also produce byte-identical shards — a stronger
+    # check that they agree on physical layout. This holds for EVERY order
+    # (without special-casing any by name): both pipelines lay chunks out via
+    # the same `_subchunk_order_iter`, so for a given codec instance they must
+    # land on the same bytes whatever that order resolves to. We make no
+    # assumption here about what any particular order "means" — only that the
+    # two implementations agree.
+    assert sync_bytes == batched_bytes, (
+        f"pipelines wrote different bytes for subchunk_write_order={subchunk_write_order!r} "
+        f"(index_location={index_location!r}) — byte-range write fast path likely assumed "
+        f"the wrong physical chunk order"
+    )
diff --git a/tests/test_store/test_get_ranges.py b/tests/test_store/test_get_ranges.py
index 8f0c6a4814..f04251adf4 100644
--- a/tests/test_store/test_get_ranges.py
+++ b/tests/test_store/test_get_ranges.py
@@ -18,6 +18,7 @@
 from zarr.core.buffer import default_buffer_prototype
 from zarr.storage import MemoryStore
 from zarr.storage._wrapper import WrapperStore
+from zarr.testing.store import LatencyStore
 
 if TYPE_CHECKING:
     from collections.abc import AsyncIterator, Sequence
@@ -58,6 +59,46 @@ async def test_memory_store_get_ranges_missing_key_raises() -> None:
         await anext(agen)
 
 
+def test_get_ranges_sync_reads_multiple_ranges() -> None:
+    """The synchronous `get_ranges_sync` on a sync-capable store returns each
+    requested range, mirroring the async `get_ranges` happy path."""
+    import asyncio
+
+    store = MemoryStore()
+    blob = bytes(i % 256 for i in range(512))
+    asyncio.run(_write(store, "blob", blob))
+
+    ranges = [RangeByteRequest(0, 10), RangeByteRequest(100, 110)]
+    proto = default_buffer_prototype()
+    flat: dict[int, bytes] = {}
+    for idx, buf in store.get_ranges_sync("blob", ranges, prototype=proto):
+        assert buf is not None
+        flat[idx] = buf.to_bytes()
+
+    assert flat[0] == blob[0:10]
+    assert flat[1] == blob[100:110]
+
+
+def test_get_ranges_sync_missing_key_raises() -> None:
+    """A missing key makes `get_ranges_sync` raise a BaseExceptionGroup
+    containing FileNotFoundError — the same contract as async `get_ranges`, so
+    callers handle a deleted shard uniformly across sync and async paths."""
+    store = MemoryStore()
+    proto = default_buffer_prototype()
+    with pytest.RaisesGroup(pytest.RaisesExc(FileNotFoundError)):
+        store.get_ranges_sync("does-not-exist", [RangeByteRequest(0, 10)], prototype=proto)
+
+
+def test_get_ranges_sync_on_non_sync_store_raises_type_error() -> None:
+    """`get_ranges_sync` requires the store to support synchronous reads
+    (`SupportsGetSync`); a non-sync store raises TypeError rather than silently
+    falling back."""
+    store = LatencyStore(MemoryStore(), get_latency=0.0, set_latency=0.0)
+    proto = default_buffer_prototype()
+    with pytest.raises(TypeError, match="does not support synchronous reads"):
+        store.get_ranges_sync("k", [RangeByteRequest(0, 10)], prototype=proto)
+
+
 async def test_wrapper_store_delegates_get_ranges() -> None:
     """WrapperStore.get_ranges must delegate to the wrapped store, not fall back to the default."""
 
diff --git a/tests/test_store/test_local.py b/tests/test_store/test_local.py
index 6756bc83d9..72b30e0ac2 100644
--- a/tests/test_store/test_local.py
+++ b/tests/test_store/test_local.py
@@ -8,6 +8,10 @@
 
 import zarr
 from zarr import create_array
+
+# SupportsSetRange import disabled with the byte-range-write tests below
+# (removed from this PR pending a store-interface decision).
+# from zarr.abc.store import SupportsSetRange
 from zarr.core.buffer import Buffer, cpu
 from zarr.storage import LocalStore
 from zarr.storage._local import _atomic_write
@@ -108,6 +112,58 @@ async def test_move(
         ):
             await store2.move(destination)
 
+    # --- byte-range-write tests: disabled ---
+    # Byte-range-write support (set_range / set_range_sync / SupportsSetRange)
+    # was removed from this PR pending a decision on the store interface. These
+    # tests are known-good and kept commented out to restore once that lands.
+    # def test_supports_set_range(self, store: LocalStore) -> None:
+    #     """LocalStore should implement SupportsSetRange."""
+    #     assert isinstance(store, SupportsSetRange)
+    #
+    # @pytest.mark.parametrize(
+    #     ("start", "patch", "expected"),
+    #     [
+    #         (0, b"XX", b"XXAAAAAAAA"),
+    #         (3, b"XX", b"AAAXXAAAAA"),
+    #         (8, b"XX", b"AAAAAAAAXX"),
+    #         (0, b"ZZZZZZZZZZ", b"ZZZZZZZZZZ"),
+    #         (5, b"B", b"AAAAABAAAA"),
+    #         (0, b"BCDE", b"BCDEAAAAAA"),
+    #     ],
+    #     ids=["start", "middle", "end", "full-overwrite", "single-byte", "multi-byte-start"],
+    # )
+    # async def test_set_range(
+    #     self, store: LocalStore, start: int, patch: bytes, expected: bytes
+    # ) -> None:
+    #     """set_range should overwrite bytes at the given offset."""
+    #     await store.set("test/key", cpu.Buffer.from_bytes(b"AAAAAAAAAA"))
+    #     await store.set_range("test/key", cpu.Buffer.from_bytes(patch), start=start)
+    #     result = await store.get("test/key", prototype=cpu.buffer_prototype)
+    #     assert result is not None
+    #     assert result.to_bytes() == expected
+    #
+    # @pytest.mark.parametrize(
+    #     ("start", "patch", "expected"),
+    #     [
+    #         (0, b"XX", b"XXAAAAAAAA"),
+    #         (3, b"XX", b"AAAXXAAAAA"),
+    #         (8, b"XX", b"AAAAAAAAXX"),
+    #         (0, b"ZZZZZZZZZZ", b"ZZZZZZZZZZ"),
+    #         (5, b"B", b"AAAAABAAAA"),
+    #         (0, b"BCDE", b"BCDEAAAAAA"),
+    #     ],
+    #     ids=["start", "middle", "end", "full-overwrite", "single-byte", "multi-byte-start"],
+    # )
+    # def test_set_range_sync(
+    #     self, store: LocalStore, start: int, patch: bytes, expected: bytes
+    # ) -> None:
+    #     """set_range_sync should overwrite bytes at the given offset."""
+    #     sync(store.set("test/key", cpu.Buffer.from_bytes(b"AAAAAAAAAA")))
+    #     store.set_range_sync("test/key", cpu.Buffer.from_bytes(patch), start=start)
+    #     result = store.get_sync(key="test/key", prototype=cpu.buffer_prototype)
+    #     assert result is not None
+    #     assert result.to_bytes() == expected
+
 
 @pytest.mark.parametrize("exclusive", [True, False])
 def test_atomic_write_successful(tmp_path: pathlib.Path, exclusive: bool) -> None:
diff --git a/tests/test_store/test_memory.py b/tests/test_store/test_memory.py
index 1e3ee89e92..a3c0509519 100644
--- a/tests/test_store/test_memory.py
+++ b/tests/test_store/test_memory.py
@@ -8,6 +8,10 @@
 import pytest
 
 import zarr
+
+# SupportsSetRange import disabled with the byte-range-write tests below
+# (removed from this PR pending a store-interface decision).
+# from zarr.abc.store import SupportsSetRange
 from zarr.core.buffer import Buffer, cpu, gpu
 from zarr.errors import ZarrUserWarning
 from zarr.storage import GpuMemoryStore, ManagedMemoryStore, MemoryStore
@@ -76,6 +80,59 @@ async def test_deterministic_size(
         np.testing.assert_array_equal(a[:3], 1)
         np.testing.assert_array_equal(a[3:], 0)
 
+    # --- byte-range-write tests: disabled ---
+    # Byte-range-write support (set_range / set_range_sync / SupportsSetRange)
+    # was removed from this PR pending a decision on the store interface. These
+    # tests are known-good and kept commented out to restore once that lands.
+    # def test_supports_set_range(self, store: MemoryStore) -> None:
+    #     """MemoryStore should implement SupportsSetRange."""
+    #     assert isinstance(store, SupportsSetRange)
+    #
+    # @pytest.mark.parametrize(
+    #     ("start", "patch", "expected"),
+    #     [
+    #         (0, b"XX", b"XXAAAAAAAA"),
+    #         (3, b"XX", b"AAAXXAAAAA"),
+    #         (8, b"XX", b"AAAAAAAAXX"),
+    #         (0, b"ZZZZZZZZZZ", b"ZZZZZZZZZZ"),
+    #         (5, b"B", b"AAAAABAAAA"),
+    #         (0, b"BCDE", b"BCDEAAAAAA"),
+    #     ],
+    #     ids=["start", "middle", "end", "full-overwrite", "single-byte", "multi-byte-start"],
+    # )
+    # async def test_set_range(
+    #     self, store: MemoryStore, start: int, patch: bytes, expected: bytes
+    # ) -> None:
+    #     """set_range should overwrite bytes at the given offset."""
+    #     await store.set("test/key", cpu.Buffer.from_bytes(b"AAAAAAAAAA"))
+    #     await store.set_range("test/key", cpu.Buffer.from_bytes(patch), start=start)
+    #     result = await store.get("test/key", prototype=cpu.buffer_prototype)
+    #     assert result is not None
+    #     assert result.to_bytes() == expected
+    #
+    # @pytest.mark.parametrize(
+    #     ("start", "patch", "expected"),
+    #     [
+    #         (0, b"XX", b"XXAAAAAAAA"),
+    #         (3, b"XX", b"AAAXXAAAAA"),
+    #         (8, b"XX", b"AAAAAAAAXX"),
+    #         (0, b"ZZZZZZZZZZ", b"ZZZZZZZZZZ"),
+    #         (5, b"B", b"AAAAABAAAA"),
+    #         (0, b"BCDE", b"BCDEAAAAAA"),
+    #     ],
+    #     ids=["start", "middle", "end", "full-overwrite", "single-byte", "multi-byte-start"],
+    # )
+    # def test_set_range_sync(
+    #     self, store: MemoryStore, start: int, patch: bytes, expected: bytes
+    # ) -> None:
+    #     """set_range_sync should overwrite bytes at the given offset."""
+    #     store._is_open = True
+    #     store._store_dict["test/key"] = cpu.Buffer.from_bytes(b"AAAAAAAAAA")
+    #     store.set_range_sync("test/key", cpu.Buffer.from_bytes(patch), start=start)
+    #     result = store.get_sync(key="test/key", prototype=cpu.buffer_prototype)
+    #     assert result is not None
+    #     assert result.to_bytes() == expected
+
 
 # TODO: fix this warning
 @pytest.mark.filterwarnings("ignore:Unclosed client session:ResourceWarning")