From 90ab6e19a937c6ef895d61aaa43f96e4db193d93 Mon Sep 17 00:00:00 2001
From: Mark Harfouche
Date: Thu, 18 Dec 2025 15:01:45 -0500
Subject: [PATCH 1/4] Cache the result of the zero-check

Closes https://github.com/zarr-developers/zarr-python/issues/3627
---
 changes/3628.misc.md            | 1 +
 src/zarr/core/codec_pipeline.py | 6 ++++++
 2 files changed, 7 insertions(+)
 create mode 100644 changes/3628.misc.md

diff --git a/changes/3628.misc.md b/changes/3628.misc.md
new file mode 100644
index 0000000000..0aa706e5cd
--- /dev/null
+++ b/changes/3628.misc.md
@@ -0,0 +1 @@
+Avoid reading lazy arrays or on-device arrays twice when comparing them to the fill value during writes.

diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py
index fd557ac43e..e77526e5b8 100644
--- a/src/zarr/core/codec_pipeline.py
+++ b/src/zarr/core/codec_pipeline.py
@@ -413,6 +413,12 @@ async def _read_key(
                 if chunk_array is None:
                     chunk_array_batch.append(None)  # type: ignore[unreachable]
                 else:
+                    # The all_equal comparison below will effectively force the array
+                    # into memory.
+                    # If the result is useful, we want to avoid reading it twice
+                    # from a potentially lazy operation, so we cache it here.
+                    # If the result is not useful, we leave it for the garbage collector.
+                    chunk_array._data = chunk_array.as_numpy_array()
                     if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal(
                         fill_value_or_default(chunk_spec)
                     ):

From afb8fffc4ddfb376001099013e937f2ffd4fc47a Mon Sep 17 00:00:00 2001
From: Mark Harfouche
Date: Thu, 1 Jan 2026 18:37:22 -0500
Subject: [PATCH 2/4] Update for improved GPU support

---
 src/zarr/core/codec_pipeline.py | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py
index e77526e5b8..53d0c5aa09 100644
--- a/src/zarr/core/codec_pipeline.py
+++ b/src/zarr/core/codec_pipeline.py
@@ -5,6 +5,8 @@
 from typing import TYPE_CHECKING, Any, TypeVar
 from warnings import warn
 
+import numpy as np
+
 from zarr.abc.codec import (
     ArrayArrayCodec,
     ArrayBytesCodec,
@@ -19,6 +21,7 @@
 from zarr.core.indexing import SelectorTuple, is_scalar
 from zarr.errors import ZarrUserWarning
 from zarr.registry import register_pipeline
+from zarr.core.buffer import NDBuffer
 
 if TYPE_CHECKING:
     from collections.abc import Iterable, Iterator
@@ -26,7 +29,7 @@
 
     from zarr.abc.store import ByteGetter, ByteSetter
     from zarr.core.array_spec import ArraySpec
-    from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer
+    from zarr.core.buffer import Buffer, BufferPrototype
     from zarr.core.chunk_grids import ChunkGrid
     from zarr.core.dtype.wrapper import TBaseDType, TBaseScalar, ZDType
 
@@ -413,18 +416,21 @@ async def _read_key(
                 if chunk_array is None:
                     chunk_array_batch.append(None)  # type: ignore[unreachable]
                 else:
-                    # The all_equal comparison below will effectively force the array
-                    # into memory.
-                    # If the result is useful, we want to avoid reading it twice
-                    # from a potentially lazy operation, so we cache it here.
-                    # If the result is not useful, we leave it for the garbage collector.
-                    chunk_array._data = chunk_array.as_numpy_array()
-                    if not chunk_spec.config.write_empty_chunks and chunk_array.all_equal(
-                        fill_value_or_default(chunk_spec)
-                    ):
-                        chunk_array_batch.append(None)
-                    else:
-                        chunk_array_batch.append(chunk_array)
+                    if not chunk_spec.config.write_empty_chunks:
+                        # The all_equal comparison below will effectively force the array
+                        # into memory.
+                        # If the result is useful, we want to avoid reading it twice
+                        # from a potentially lazy operation, so we cache it here.
+                        # If the result is not useful, we leave it for the garbage collector.
+                        # We skip the conversion when the data already lives on the GPU.
+                        if not hasattr(chunk_array._data, '__cuda_array_interface__'):
+                            chunk_array = NDBuffer(np.asarray(chunk_array._data))
+
+                        if chunk_array.all_equal(
+                            fill_value_or_default(chunk_spec)
+                        ):
+                            chunk_array = None
+                    chunk_array_batch.append(chunk_array)
 
         chunk_bytes_batch = await self.encode_batch(
             [
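A minimal, self-contained sketch of the materialization strategy introduced in patch 2, using only NumPy. The names materialize_if_on_host and FakeDeviceArray are illustrative stand-ins, not zarr API: anything exposing __cuda_array_interface__ (a CuPy-style device array) is passed through untouched so no device-to-host copy is forced, while everything else is read into host memory exactly once with np.asarray.

    import numpy as np


    def materialize_if_on_host(data):
        """Read data into NumPy once, unless it already lives on a GPU."""
        if hasattr(data, "__cuda_array_interface__"):
            # CuPy-style device arrays advertise this attribute; leave them alone
            # so the emptiness check can run on the device without a copy.
            return data
        # Lazy, memory-mapped, or plain arrays are materialized exactly once here.
        return np.asarray(data)


    class FakeDeviceArray:
        """Stand-in for a device array; only the attribute's presence is checked."""

        __cuda_array_interface__ = {}


    host = materialize_if_on_host(memoryview(b"\x00\x00\x00\x00"))  # becomes an ndarray
    device = materialize_if_on_host(FakeDeviceArray())              # returned untouched
    print(type(host).__name__, type(device).__name__)               # ndarray FakeDeviceArray

Patch 2 applies the same hasattr test to chunk_array._data before deciding whether to wrap the result of np.asarray in a fresh NDBuffer.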
From db37fed6ef8630e5f8002f8acdc8323eea9e433e Mon Sep 17 00:00:00 2001
From: Mark Harfouche
Date: Fri, 2 Jan 2026 08:16:16 -0500
Subject: [PATCH 3/4] try again

---
 src/zarr/core/codec_pipeline.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py
index 53d0c5aa09..f612105dd3 100644
--- a/src/zarr/core/codec_pipeline.py
+++ b/src/zarr/core/codec_pipeline.py
@@ -416,21 +416,24 @@ async def _read_key(
                 if chunk_array is None:
                     chunk_array_batch.append(None)  # type: ignore[unreachable]
                 else:
-                    if not chunk_spec.config.write_empty_chunks:
+                    if chunk_spec.config.write_empty_chunks:
+                        chunk_array_batch.append(chunk_array)
+                    else:
                         # The all_equal comparison below will effectively force the array
                         # into memory.
                         # If the result is useful, we want to avoid reading it twice
                         # from a potentially lazy operation, so we cache it here.
                         # If the result is not useful, we leave it for the garbage collector.
                         # We skip the conversion when the data already lives on the GPU.
                         if not hasattr(chunk_array._data, '__cuda_array_interface__'):
                             chunk_array = NDBuffer(np.asarray(chunk_array._data))
 
                         if chunk_array.all_equal(
                             fill_value_or_default(chunk_spec)
                         ):
-                            chunk_array = None
-                    chunk_array_batch.append(chunk_array)
+                            chunk_array_batch.append(None)
+                        else:
+                            chunk_array_batch.append(chunk_array)
 
         chunk_bytes_batch = await self.encode_batch(
             [

From eeedcccea263c28cc824b98f07d25c541a9efb90 Mon Sep 17 00:00:00 2001
From: Mark Harfouche
Date: Fri, 2 Jan 2026 09:01:31 -0500
Subject: [PATCH 4/4] hack

---
 src/zarr/core/codec_pipeline.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py
index f612105dd3..a9f1cb8938 100644
--- a/src/zarr/core/codec_pipeline.py
+++ b/src/zarr/core/codec_pipeline.py
@@ -426,7 +426,11 @@ async def _read_key(
                         # If the result is not useful, we leave it for the garbage collector.
                         # We skip the conversion when the data already lives on the GPU.
                         if not hasattr(chunk_array._data, '__cuda_array_interface__'):
-                            chunk_array = NDBuffer(np.asarray(chunk_array._data))
+                            # It is unclear why constructing a new NDBuffer does not work here;
+                            # something seems to be getting missed:
+                            # chunk_array = NDBuffer(np.asarray(chunk_array._data))
+                            # As a workaround, mutate the private _data attribute in place.
+                            chunk_array._data = np.asarray(chunk_array._data)
 
                         if chunk_array.all_equal(
                             fill_value_or_default(chunk_spec)
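For background on the double read that issue 3627 describes and patch 1 avoids, a small illustrative sketch follows; CountingLazyArray is a hypothetical stand-in for a lazy or on-disk array and is not part of zarr. Checking a chunk for emptiness and then encoding it each materialize the data, so without caching the backing store is read twice; caching the NumPy result reads it once.

    import numpy as np


    class CountingLazyArray:
        """Hypothetical lazy array that counts how often it is materialized."""

        def __init__(self, shape):
            self.shape = shape
            self.reads = 0

        def __array__(self, dtype=None, copy=None):
            # NumPy calls this whenever concrete values are needed (e.g. np.asarray).
            self.reads += 1
            return np.zeros(self.shape, dtype=dtype)


    # Without caching: the emptiness check and the encode step each trigger a read.
    lazy = CountingLazyArray((4, 4))
    is_empty = bool(np.all(np.asarray(lazy) == 0))  # first read
    payload = np.asarray(lazy).tobytes()            # second read
    print(lazy.reads)  # 2

    # With caching, as in patch 1: materialize once and reuse the result.
    lazy = CountingLazyArray((4, 4))
    cached = np.asarray(lazy)
    is_empty = bool(np.all(cached == 0))
    payload = cached.tobytes()
    print(lazy.reads)  # 1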