From 22a58073719189333a4369170794504b42fefd6b Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Thu, 13 Jun 2024 17:03:42 -0700
Subject: [PATCH 01/46] Initial implementation of a GPU version of Buffer and
 NDBuffer

---
 src/zarr/buffer.py | 291 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 291 insertions(+)

diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py
index 86f9b53477..cac2d2f204 100644
--- a/src/zarr/buffer.py
+++ b/src/zarr/buffer.py
@@ -23,6 +23,11 @@
     from zarr.codecs.bytes import Endian
     from zarr.common import BytesLike
 
+try:
+    import cupy as cp
+except ImportError:
+    cp = None
+
 
 @runtime_checkable
 class ArrayLike(Protocol):
@@ -461,6 +466,291 @@ def as_numpy_array_wrapper(
     return prototype.buffer.from_bytes(func(buf.as_numpy_array()))
 
 
+class GpuBuffer(Buffer):
+    """A flat contiguous memory block on the GPU
+
+    We use Buffer throughout Zarr to represent a contiguous block of memory.
+
+    A Buffer is backed by a underlying array-like instance that represents
+    the memory. The memory type is unspecified; can be regular host memory,
+    CUDA device memory, or something else. The only requirement is that the
+    array-like instance can be copied/converted to a regular Numpy array
+    (host memory).
+
+    Note
+    ----
+    This buffer is untyped, so all indexing and sizes are in bytes.
+
+    Parameters
+    ----------
+    array_like
+        array-like object that must be 1-dim, contiguous, and byte dtype.
+    """
+
+    def __init__(self, array_like: ArrayLike):
+        if cp is None:
+            raise RuntimeError("Cannot use GpuBuffer without cupy")
+
+        if array_like.ndim != 1:
+            raise ValueError("array_like: only 1-dim allowed")
+        if array_like.dtype != np.dtype("b"):
+            raise ValueError("array_like: only byte dtype allowed")
+
+        # if type(array_like) == cp.ndarray
+        #     self._data = array_like
+        if hasattr(array_like, "__cuda_array_interface__"):
+            self._data = cp.asarray(array_like)
+        else:
+            #raise ArgumentError("GpuBuffer only supports inputs that implement the '__cuda_array_interface__' protocol")
+            # Slow copy based path for arrays that don't support the __cuda_array_interface__
+            # TODO: Add a fast zero-copy path for arrays that support the dlpack protocol
+            buffer = Buffer(array_like)
+            self._data = cp.asarray(buffer.as_numpy_array())
+
+    @classmethod
+    def create_zero_length(cls) -> Self:
+        """Create an empty buffer with length zero
+
+        Returns
+        -------
+            New empty 0-length buffer
+        """
+        return cls(cp.array([], dtype="b"))
+
+    @classmethod
+    def from_buffer(cls, buffer: Buffer) -> Self:
+        """Create an GpuBuffer given an arbitrary Buffer
+
+        Returns
+        -------
+            New empty 0-length buffer
+        """
+        return cls(buffer.as_array_like())
+
+    @classmethod
+    def from_array_like(cls, array_like: ArrayLike) -> Self:
+        """Create a new buffer of a array-like object
+
+        Parameters
+        ----------
+        array_like
+            array-like object that must be 1-dim, contiguous, and byte dtype.
+
+        Returns
+        -------
+            New buffer representing `array_like`
+        """
+        return cls(array_like)
+
+    @classmethod
+    def from_bytes(cls, bytes_like: BytesLike) -> Self:
+        """Create a new buffer of a bytes-like object (host memory)
+
+        Parameters
+        ----------
+        bytes_like
+           bytes-like object
+
+        Returns
+        -------
+            New buffer representing `bytes_like`
+        """
+        return cls.from_array_like(cp.frombuffer(bytes_like, dtype="b"))
+
+    def as_numpy_array(self) -> npt.NDArray[Any]:
+        """Returns the buffer as a NumPy array (host memory).
+
+        Warning
+        -------
+        Might have to copy data, consider using `.as_array_like()` instead.
+
+        Returns
+        -------
+            NumPy array of this buffer (might be a data copy)
+        """
+        return cp.asnumpy(self._data)
+
+    def __add__(self, other: Buffer) -> Self:
+        """Concatenate two buffers"""
+
+        other_array = other.as_array_like()
+        assert other_array.dtype == np.dtype("b")
+        gpu_other = GpuBuffer(other_array)
+        gpu_other_array = gpu_other.as_array_like()
+        return self.__class__(
+            cp.concatenate((cp.asanyarray(self._data), cp.asanyarray(gpu_other_array)))
+        )
+
+
+class GpuNDBuffer(NDBuffer):
+    """A n-dimensional memory block on the GPU
+
+    We use NDBuffer throughout Zarr to represent a n-dimensional memory block.
+
+    A NDBuffer is backed by a underlying ndarray-like instance that represents
+    the memory. The memory type is unspecified; can be regular host memory,
+    CUDA device memory, or something else. The only requirement is that the
+    ndarray-like instance can be copied/converted to a regular Numpy array
+    (host memory).
+
+    Note
+    ----
+    The two buffer classes Buffer and NDBuffer are very similar. In fact, Buffer
+    is a special case of NDBuffer where dim=1, stride=1, and dtype="b". However,
+    in order to use Python's type system to differentiate between the contiguous
+    Buffer and the n-dim (non-contiguous) NDBuffer, we keep the definition of the
+    two classes separate.
+
+    Parameters
+    ----------
+    ndarray_like
+        ndarray-like object that is convertible to a regular Numpy array.
+    """
+
+    def __init__(self, array: NDArrayLike):
+        if cp is None:
+            raise RuntimeError("Cannot use GpuNDBuffer without cupy")
+
+        # assert array.ndim > 0
+        assert array.dtype != object
+        self._data = array
+
+        # if isinstance(array_like, cp.ndarray):
+        #     self._data = array_like
+        if hasattr(array, "__cuda_array_interface__"):
+            self._data = cp.asarray(array)
+        else:
+            # Slow copy based path for arrays that don't support the __cuda_array_interface__
+            # TODO: Add a fast zero-copy path for arrays that support the dlpack protocol
+            nd_buffer = NDBuffer(array)
+            self._data = cp.asarray(nd_buffer.as_numpy_array())
+
+    @classmethod
+    def create(
+        cls,
+        *,
+        shape: Iterable[int],
+        dtype: npt.DTypeLike,
+        order: Literal["C", "F"] = "C",
+        fill_value: Any | None = None,
+    ) -> Self:
+        """Create a new buffer and its underlying ndarray-like object
+
+        Parameters
+        ----------
+        shape
+            The shape of the buffer and its underlying ndarray-like object
+        dtype
+            The datatype of the buffer and its underlying ndarray-like object
+        order
+            Whether to store multi-dimensional data in row-major (C-style) or
+            column-major (Fortran-style) order in memory.
+        fill_value
+            If not None, fill the new buffer with a scalar value.
+
+        Returns
+        -------
+            New buffer representing a new ndarray_like object
+
+        Developer Notes
+        ---------------
+        A subclass can overwrite this method to create a ndarray-like object
+        other then the default Numpy array.
+        """
+        ret = cls(cp.empty(shape=tuple(shape), dtype=dtype, order=order))
+        if fill_value is not None:
+            ret.fill(fill_value)
+        return ret
+
+    @classmethod
+    def from_ndarray_like(cls, ndarray_like: NDArrayLike) -> Self:
+        """Create a new buffer of a ndarray-like object
+
+        Parameters
+        ----------
+        ndarray_like
+            ndarray-like object
+
+        Returns
+        -------
+            New buffer representing `ndarray_like`
+        """
+        return cls(ndarray_like)
+
+    @classmethod
+    def from_numpy_array(cls, array_like: npt.ArrayLike) -> Self:
+        """Create a new buffer of Numpy array-like object
+
+        Parameters
+        ----------
+        array_like
+            Object that can be coerced into a Numpy array
+
+        Returns
+        -------
+            New buffer representing `array_like`
+        """
+        return cls(cp.asarray(array_like))
+
+    def as_ndarray_like(self) -> NDArrayLike:
+        """Returns the underlying array (host or device memory) of this buffer
+
+        This will never copy data.
+
+        Returns
+        -------
+            The underlying array such as a NumPy or CuPy array.
+        """
+        return self._data
+
+    def as_numpy_array(self) -> npt.NDArray[Any]:
+        """Returns the buffer as a NumPy array (host memory).
+
+        Warning
+        -------
+        Might have to copy data, consider using `.as_ndarray_like()` instead.
+
+        Returns
+        -------
+            NumPy array of this buffer (might be a data copy)
+        """
+        return cp.asnumpy(self._data)
+
+    @property
+    def dtype(self) -> np.dtype[Any]:
+        return self._data.dtype
+
+    @property
+    def shape(self) -> tuple[int, ...]:
+        return self._data.shape
+
+    @property
+    def byteorder(self) -> Endian:
+        from zarr.codecs.bytes import Endian
+
+        if self.dtype.byteorder == "<":
+            return Endian.little
+        elif self.dtype.byteorder == ">":
+            return Endian.big
+        else:
+            return Endian(sys.byteorder)
+
+    def __getitem__(self, key: Any) -> Self:
+        return self.__class__(self._data.__getitem__(key))
+
+    def __setitem__(self, key: Any, value: Any) -> None:
+        if isinstance(value, GpuNDBuffer):
+            value = value._data
+        elif isinstance(value, NDBuffer):
+            gpu_value = GpuNDBuffer(value.as_ndarray_like())
+            value = gpu_value._data
+        self._data.__setitem__(key, value)
+
+    def transpose(self, axes: SupportsIndex | Sequence[SupportsIndex] | None) -> Self:
+        return self.__class__(self._data.transpose(axes))
+
+
+
 class BufferPrototype(NamedTuple):
     """Prototype of the Buffer and NDBuffer class
 
@@ -480,3 +770,4 @@ class BufferPrototype(NamedTuple):
 
 # The default buffer prototype used throughout the Zarr codebase.
 default_buffer_prototype = BufferPrototype(buffer=Buffer, nd_buffer=NDBuffer)
+gpu_buffer_prototype = BufferPrototype(buffer=GpuBuffer, nd_buffer=GpuNDBuffer)

From d8cc79f29f1aa5c3440990b5f86ede6bc2b12f36 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Thu, 13 Jun 2024 20:22:57 -0700
Subject: [PATCH 02/46] Adding cupy as an optional dependency

---
 pyproject.toml     | 3 +++
 src/zarr/buffer.py | 3 +--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index f1be6725b6..288291aea4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,6 +60,9 @@ jupyter = [
     'ipytree>=0.2.2',
     'ipywidgets>=8.0.0',
 ]
+gpu = [
+    "cupy-cuda12x>=13.0.0",
+]
 docs = [
     'sphinx',
     'sphinx-autobuild>=2021.3.14',
diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py
index cac2d2f204..dbfa0f04eb 100644
--- a/src/zarr/buffer.py
+++ b/src/zarr/buffer.py
@@ -501,7 +501,7 @@ def __init__(self, array_like: ArrayLike):
         if hasattr(array_like, "__cuda_array_interface__"):
             self._data = cp.asarray(array_like)
         else:
-            #raise ArgumentError("GpuBuffer only supports inputs that implement the '__cuda_array_interface__' protocol")
+            # raise ArgumentError("GpuBuffer only supports inputs that implement the '__cuda_array_interface__' protocol")
             # Slow copy based path for arrays that don't support the __cuda_array_interface__
             # TODO: Add a fast zero-copy path for arrays that support the dlpack protocol
             buffer = Buffer(array_like)
@@ -750,7 +750,6 @@ def transpose(self, axes: SupportsIndex | Sequence[SupportsIndex] | None) -> Sel
         return self.__class__(self._data.transpose(axes))
 
 
-
 class BufferPrototype(NamedTuple):
     """Prototype of the Buffer and NDBuffer class
 

From 4d2b8c7ca6f4a6e2a5b3bb35337b3c14229019f6 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Thu, 13 Jun 2024 21:11:42 -0700
Subject: [PATCH 03/46] Adding GPU prototype test

---
 pyproject.toml          |  2 +-
 tests/v3/test_buffer.py | 39 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 288291aea4..16745cb5b8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -61,7 +61,7 @@ jupyter = [
     'ipywidgets>=8.0.0',
 ]
 gpu = [
-    "cupy-cuda12x>=13.0.0",
+    "cupy>=13.0.0",
 ]
 docs = [
     'sphinx',
diff --git a/tests/v3/test_buffer.py b/tests/v3/test_buffer.py
index 77e1b6b688..9b89237dfa 100644
--- a/tests/v3/test_buffer.py
+++ b/tests/v3/test_buffer.py
@@ -8,7 +8,14 @@
 import pytest
 
 from zarr.array import AsyncArray
-from zarr.buffer import ArrayLike, Buffer, BufferPrototype, NDArrayLike, NDBuffer
+from zarr.buffer import (
+    ArrayLike,
+    Buffer,
+    BufferPrototype,
+    NDArrayLike,
+    NDBuffer,
+    gpu_buffer_prototype,
+)
 from zarr.codecs.blosc import BloscCodec
 from zarr.codecs.bytes import BytesCodec
 from zarr.codecs.crc32c_ import Crc32cCodec
@@ -21,6 +28,11 @@
 if TYPE_CHECKING:
     from typing_extensions import Self
 
+try:
+    import cupy as cp
+except ImportError:
+    cp = None
+
 
 class MyNDArrayLike(np.ndarray):
     """An example of a ndarray-like class"""
@@ -106,6 +118,31 @@ async def test_async_array_prototype():
     assert np.array_equal(expect, got)
 
 
+@pytest.mark.skipif(cp is None, reason="requires cupy")
+@pytest.mark.asyncio
+async def test_async_array_gpu_prototype():
+    """Test the use of the GPU buffer prototype"""
+
+    expect = cp.zeros((9, 9), dtype="uint16", order="F")
+    a = await AsyncArray.create(
+        StorePath(MemoryStore(mode="w")) / "test_async_array_gpu_prototype",
+        shape=expect.shape,
+        chunk_shape=(5, 5),
+        dtype=expect.dtype,
+        fill_value=0,
+    )
+    expect[1:4, 3:6] = cp.ones((3, 3))
+
+    await a.setitem(
+        selection=(slice(1, 4), slice(3, 6)),
+        value=cp.ones((3, 3)),
+        prototype=gpu_buffer_prototype,
+    )
+    got = await a.getitem(selection=(slice(0, 9), slice(0, 9)), prototype=gpu_buffer_prototype)
+    assert isinstance(got, cp.ndarray)
+    assert cp.array_equal(expect, got)
+
+
 @pytest.mark.asyncio
 async def test_codecs_use_of_prototype():
     expect = np.zeros((10, 10), dtype="uint16", order="F")

From 36b1cb277a8d6b3a7eb9521e13b701b984fde110 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Thu, 13 Jun 2024 22:33:37 -0700
Subject: [PATCH 04/46] Adding GPU memory store implementation

---
 src/zarr/store/memory.py           | 62 +++++++++++++++++++++++++++++-
 tests/v3/test_store/test_memory.py | 37 +++++++++++++++++-
 2 files changed, 96 insertions(+), 3 deletions(-)

diff --git a/src/zarr/store/memory.py b/src/zarr/store/memory.py
index 7b73330b6c..e1ebf9110c 100644
--- a/src/zarr/store/memory.py
+++ b/src/zarr/store/memory.py
@@ -3,7 +3,7 @@
 from collections.abc import AsyncGenerator, MutableMapping
 
 from zarr.abc.store import Store
-from zarr.buffer import Buffer, BufferPrototype
+from zarr.buffer import Buffer, BufferPrototype, GpuBuffer
 from zarr.common import OpenMode, concurrent_map
 from zarr.store.utils import _normalize_interval_index
 
@@ -101,3 +101,63 @@ async def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
             for key in self._store_dict:
                 if key.startswith(prefix + "/") and key != prefix:
                     yield key.removeprefix(prefix + "/").split("/")[0]
+
+
+class GpuMemoryStore(MemoryStore):
+    _store_dict: MutableMapping[str, Buffer]
+
+    def __init__(
+        self, store_dict: MutableMapping[str, Buffer] | None = None, *, mode: OpenMode = "r"
+    ):
+        super().__init__(mode=mode)
+        self._store_dict = {}
+        if store_dict:
+            self._store_dict = {k: GpuBuffer.from_buffer(store_dict[k]) for k in iter(store_dict)}
+
+    def __str__(self) -> str:
+        return f"gpumemory://{id(self._store_dict)}"
+
+    def __repr__(self) -> str:
+        return f"GpuMemoryStore({str(self)!r})"
+
+    async def get(
+        self,
+        key: str,
+        prototype: BufferPrototype,
+        byte_range: tuple[int | None, int | None] | None = None,
+    ) -> Buffer | None:
+        assert isinstance(key, str)
+        try:
+            value = self._store_dict[key]
+            start, length = _normalize_interval_index(value, byte_range)
+            return value[start : start + length]
+        except KeyError:
+            return None
+
+    async def get_partial_values(
+        self,
+        prototype: BufferPrototype,
+        key_ranges: list[tuple[str, tuple[int | None, int | None]]],
+    ) -> list[Buffer | None]:
+        # All the key-ranges arguments goes with the same prototype
+        async def _get(key: str, byte_range: tuple[int, int | None]) -> Buffer | None:
+            # Q: use prototype here to convert to bespoke buffer class? If so, how?
+            return await self.get(key, prototype=prototype, byte_range=byte_range)
+
+        vals = await concurrent_map(key_ranges, _get, limit=None)
+        return vals
+
+    async def set(self, key: str, value: Buffer, byte_range: tuple[int, int] | None = None) -> None:
+        self._check_writable()
+        assert isinstance(key, str)
+        if not isinstance(value, Buffer):
+            raise TypeError(f"Expected Buffer. Got {type(value)}.")
+
+        # Convert to GpuBuffer
+        gpu_value = value if isinstance(value, GpuBuffer) else GpuBuffer.from_buffer(value)
+        if byte_range is not None:
+            buf = self._store_dict[key]
+            buf[byte_range[0] : byte_range[1]] = gpu_value
+            self._store_dict[key] = buf
+        else:
+            self._store_dict[key] = gpu_value
diff --git a/tests/v3/test_store/test_memory.py b/tests/v3/test_store/test_memory.py
index 96b8b19e2c..a851cfc7c1 100644
--- a/tests/v3/test_store/test_memory.py
+++ b/tests/v3/test_store/test_memory.py
@@ -2,8 +2,8 @@
 
 import pytest
 
-from zarr.buffer import Buffer
-from zarr.store.memory import MemoryStore
+from zarr.buffer import Buffer, GpuBuffer
+from zarr.store.memory import GpuMemoryStore, MemoryStore
 from zarr.testing.store import StoreTests
 
 
@@ -38,3 +38,36 @@ def test_store_supports_partial_writes(self, store: MemoryStore) -> None:
 
     def test_list_prefix(self, store: MemoryStore) -> None:
         assert True
+
+
+class TestGpuMemoryStore(StoreTests[GpuMemoryStore]):
+    store_cls = GpuMemoryStore
+
+    def set(self, store: GpuMemoryStore, key: str, value: Buffer) -> None:
+        store._store_dict[key] = value
+
+    def get(self, store: MemoryStore, key: str) -> Buffer:
+        return store._store_dict[key]
+
+    @pytest.fixture(scope="function", params=[None, {}])
+    def store_kwargs(self, request) -> dict[str, str | None | dict[str, Buffer]]:
+        return {"store_dict": request.param, "mode": "w"}
+
+    @pytest.fixture(scope="function")
+    def store(self, store_kwargs: str | None | dict[str, GpuBuffer]) -> GpuMemoryStore:
+        return self.store_cls(**store_kwargs)
+
+    def test_store_repr(self, store: GpuMemoryStore) -> None:
+        assert str(store) == f"gpumemory://{id(store._store_dict)}"
+
+    def test_store_supports_writes(self, store: GpuMemoryStore) -> None:
+        assert store.supports_writes
+
+    def test_store_supports_listing(self, store: GpuMemoryStore) -> None:
+        assert store.supports_listing
+
+    def test_store_supports_partial_writes(self, store: GpuMemoryStore) -> None:
+        assert store.supports_partial_writes
+
+    def test_list_prefix(self, store: GpuMemoryStore) -> None:
+        assert True

From 04001b4458d6eafc51310bea1baaa44d01d75b16 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Mon, 17 Jun 2024 14:51:45 -0700
Subject: [PATCH 05/46] Addressing comments

---
 pyproject.toml     |  1 +
 src/zarr/buffer.py | 32 +++++++++++++++++++++++++++-----
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 16745cb5b8..40f25a9c96 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -223,4 +223,5 @@ filterwarnings = [
     "error:::zarr.*",
     "ignore:PY_SSIZE_T_CLEAN will be required.*:DeprecationWarning",
     "ignore:The loop argument is deprecated since Python 3.8.*:DeprecationWarning",
+    "ignore:Creating a Gpu*:UserWarning",
 ]
diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py
index dbfa0f04eb..a779fb1906 100644
--- a/src/zarr/buffer.py
+++ b/src/zarr/buffer.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import sys
+import warnings
 from collections.abc import Callable, Iterable, Sequence
 from typing import (
     TYPE_CHECKING,
@@ -489,7 +490,7 @@ class GpuBuffer(Buffer):
 
     def __init__(self, array_like: ArrayLike):
         if cp is None:
-            raise RuntimeError("Cannot use GpuBuffer without cupy")
+            raise ImportError("Cannot use GpuBuffer without cupy. Please install cupy.")
 
         if array_like.ndim != 1:
             raise ValueError("array_like: only 1-dim allowed")
@@ -501,9 +502,17 @@ def __init__(self, array_like: ArrayLike):
         if hasattr(array_like, "__cuda_array_interface__"):
             self._data = cp.asarray(array_like)
         else:
-            # raise ArgumentError("GpuBuffer only supports inputs that implement the '__cuda_array_interface__' protocol")
             # Slow copy based path for arrays that don't support the __cuda_array_interface__
             # TODO: Add a fast zero-copy path for arrays that support the dlpack protocol
+            msg = (
+                "Creating a GpuBuffer with an array that does not support the "
+                "__cuda_array_interface__ for zero-copy transfers, "
+                "falling back to slow copy based path"
+            )
+            warnings.warn(
+                msg,
+                stacklevel=2,
+            )
             buffer = Buffer(array_like)
             self._data = cp.asarray(buffer.as_numpy_array())
 
@@ -568,7 +577,9 @@ def as_numpy_array(self) -> npt.NDArray[Any]:
         -------
             NumPy array of this buffer (might be a data copy)
         """
-        return cp.asnumpy(self._data)
+        from typing import cast
+
+        return cast(npt.NDArray[Any], cp.asnumpy(self._data))
 
     def __add__(self, other: Buffer) -> Self:
         """Concatenate two buffers"""
@@ -609,7 +620,7 @@ class GpuNDBuffer(NDBuffer):
 
     def __init__(self, array: NDArrayLike):
         if cp is None:
-            raise RuntimeError("Cannot use GpuNDBuffer without cupy")
+            raise ImportError("Cannot use GpuNDBuffer without cupy. Please install cupy.")
 
         # assert array.ndim > 0
         assert array.dtype != object
@@ -622,6 +633,15 @@ def __init__(self, array: NDArrayLike):
         else:
             # Slow copy based path for arrays that don't support the __cuda_array_interface__
             # TODO: Add a fast zero-copy path for arrays that support the dlpack protocol
+            msg = (
+                "Creating a GpuNDBuffer with an array that does not support the "
+                "__cuda_array_interface__ for zero-copy transfers, "
+                "falling back to slow copy based path"
+            )
+            warnings.warn(
+                msg,
+                stacklevel=2,
+            )
             nd_buffer = NDBuffer(array)
             self._data = cp.asarray(nd_buffer.as_numpy_array())
 
@@ -714,7 +734,9 @@ def as_numpy_array(self) -> npt.NDArray[Any]:
         -------
             NumPy array of this buffer (might be a data copy)
         """
-        return cp.asnumpy(self._data)
+        from typing import cast
+
+        return cast(npt.NDArray[Any], cp.asnumpy(self._data))
 
     @property
     def dtype(self) -> np.dtype[Any]:

From 74a13c486787a0017b891527c75f8e1081a541e1 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Mon, 17 Jun 2024 16:34:19 -0700
Subject: [PATCH 06/46] Making GpuMemoryStore tests conditional on cupy being
 available

---
 tests/v3/test_store/test_memory.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/v3/test_store/test_memory.py b/tests/v3/test_store/test_memory.py
index a851cfc7c1..8d7c9f5499 100644
--- a/tests/v3/test_store/test_memory.py
+++ b/tests/v3/test_store/test_memory.py
@@ -6,6 +6,11 @@
 from zarr.store.memory import GpuMemoryStore, MemoryStore
 from zarr.testing.store import StoreTests
 
+try:
+    import cupy as cp
+except ImportError:
+    cp = None
+
 
 class TestMemoryStore(StoreTests[MemoryStore]):
     store_cls = MemoryStore
@@ -40,6 +45,7 @@ def test_list_prefix(self, store: MemoryStore) -> None:
         assert True
 
 
+@pytest.mark.skipif(cp is None, reason="requires cupy")
 class TestGpuMemoryStore(StoreTests[GpuMemoryStore]):
     store_cls = GpuMemoryStore
 

From bdc0a243182616828959f37bd59fb88d53490f4f Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Mon, 17 Jun 2024 22:41:26 -0700
Subject: [PATCH 07/46] Adding test checking that existing host memory codecs
 use the gpu_buffer_prototype appropriately

---
 tests/v3/test_buffer.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/tests/v3/test_buffer.py b/tests/v3/test_buffer.py
index 9b89237dfa..08118799b0 100644
--- a/tests/v3/test_buffer.py
+++ b/tests/v3/test_buffer.py
@@ -173,3 +173,34 @@ async def test_codecs_use_of_prototype():
     got = await a.getitem(selection=(slice(0, 10), slice(0, 10)), prototype=my_prototype)
     assert isinstance(got, MyNDArrayLike)
     assert np.array_equal(expect, got)
+
+
+@pytest.mark.skipif(cp is None, reason="requires cupy")
+@pytest.mark.asyncio
+async def test_codecs_use_of_gpu_prototype():
+    expect = cp.zeros((10, 10), dtype="uint16", order="F")
+    a = await AsyncArray.create(
+        StorePath(MemoryStore(mode="w")) / "test_codecs_use_of_gpu_prototype",
+        shape=expect.shape,
+        chunk_shape=(5, 5),
+        dtype=expect.dtype,
+        fill_value=0,
+        codecs=[
+            TransposeCodec(order=(1, 0)),
+            BytesCodec(),
+            BloscCodec(),
+            Crc32cCodec(),
+            GzipCodec(),
+            ZstdCodec(),
+        ],
+    )
+    expect[:] = cp.arange(100).reshape(10, 10)
+
+    await a.setitem(
+        selection=(slice(0, 10), slice(0, 10)),
+        value=expect[:],
+        prototype=gpu_buffer_prototype,
+    )
+    got = await a.getitem(selection=(slice(0, 10), slice(0, 10)), prototype=gpu_buffer_prototype)
+    assert isinstance(got, cp.ndarray)
+    assert cp.array_equal(expect, got)

From d900aa33dbc1b5c2cfee905c12cf1b0c70ee034a Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Thu, 27 Jun 2024 22:26:57 -0700
Subject: [PATCH 08/46] Reducing code and docs duplication

---
 src/zarr/buffer.py | 134 ++-------------------------------------------
 1 file changed, 6 insertions(+), 128 deletions(-)

diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py
index a779fb1906..2c336d0079 100644
--- a/src/zarr/buffer.py
+++ b/src/zarr/buffer.py
@@ -10,6 +10,7 @@
     NamedTuple,
     Protocol,
     SupportsIndex,
+    cast,
     runtime_checkable,
 )
 
@@ -497,11 +498,7 @@ def __init__(self, array_like: ArrayLike):
         if array_like.dtype != np.dtype("b"):
             raise ValueError("array_like: only byte dtype allowed")
 
-        # if type(array_like) == cp.ndarray
-        #     self._data = array_like
-        if hasattr(array_like, "__cuda_array_interface__"):
-            self._data = cp.asarray(array_like)
-        else:
+        if not hasattr(array_like, "__cuda_array_interface__"):
             # Slow copy based path for arrays that don't support the __cuda_array_interface__
             # TODO: Add a fast zero-copy path for arrays that support the dlpack protocol
             msg = (
@@ -513,8 +510,7 @@ def __init__(self, array_like: ArrayLike):
                 msg,
                 stacklevel=2,
             )
-            buffer = Buffer(array_like)
-            self._data = cp.asarray(buffer.as_numpy_array())
+        self._data = cp.asarray(array_like)
 
     @classmethod
     def create_zero_length(cls) -> Self:
@@ -532,58 +528,18 @@ def from_buffer(cls, buffer: Buffer) -> Self:
 
         Returns
         -------
-            New empty 0-length buffer
+            New GpuBuffer constructed from `buffer`
         """
         return cls(buffer.as_array_like())
 
-    @classmethod
-    def from_array_like(cls, array_like: ArrayLike) -> Self:
-        """Create a new buffer of a array-like object
-
-        Parameters
-        ----------
-        array_like
-            array-like object that must be 1-dim, contiguous, and byte dtype.
-
-        Returns
-        -------
-            New buffer representing `array_like`
-        """
-        return cls(array_like)
-
     @classmethod
     def from_bytes(cls, bytes_like: BytesLike) -> Self:
-        """Create a new buffer of a bytes-like object (host memory)
-
-        Parameters
-        ----------
-        bytes_like
-           bytes-like object
-
-        Returns
-        -------
-            New buffer representing `bytes_like`
-        """
         return cls.from_array_like(cp.frombuffer(bytes_like, dtype="b"))
 
     def as_numpy_array(self) -> npt.NDArray[Any]:
-        """Returns the buffer as a NumPy array (host memory).
-
-        Warning
-        -------
-        Might have to copy data, consider using `.as_array_like()` instead.
-
-        Returns
-        -------
-            NumPy array of this buffer (might be a data copy)
-        """
-        from typing import cast
-
         return cast(npt.NDArray[Any], cp.asnumpy(self._data))
 
     def __add__(self, other: Buffer) -> Self:
-        """Concatenate two buffers"""
-
         other_array = other.as_array_like()
         assert other_array.dtype == np.dtype("b")
         gpu_other = GpuBuffer(other_array)
@@ -626,11 +582,7 @@ def __init__(self, array: NDArrayLike):
         assert array.dtype != object
         self._data = array
 
-        # if isinstance(array_like, cp.ndarray):
-        #     self._data = array_like
-        if hasattr(array, "__cuda_array_interface__"):
-            self._data = cp.asarray(array)
-        else:
+        if not hasattr(array, "__cuda_array_interface__"):
             # Slow copy based path for arrays that don't support the __cuda_array_interface__
             # TODO: Add a fast zero-copy path for arrays that support the dlpack protocol
             msg = (
@@ -642,8 +594,7 @@ def __init__(self, array: NDArrayLike):
                 msg,
                 stacklevel=2,
             )
-            nd_buffer = NDBuffer(array)
-            self._data = cp.asarray(nd_buffer.as_numpy_array())
+        self._data = cp.asarray(array)
 
     @classmethod
     def create(
@@ -654,49 +605,11 @@ def create(
         order: Literal["C", "F"] = "C",
         fill_value: Any | None = None,
     ) -> Self:
-        """Create a new buffer and its underlying ndarray-like object
-
-        Parameters
-        ----------
-        shape
-            The shape of the buffer and its underlying ndarray-like object
-        dtype
-            The datatype of the buffer and its underlying ndarray-like object
-        order
-            Whether to store multi-dimensional data in row-major (C-style) or
-            column-major (Fortran-style) order in memory.
-        fill_value
-            If not None, fill the new buffer with a scalar value.
-
-        Returns
-        -------
-            New buffer representing a new ndarray_like object
-
-        Developer Notes
-        ---------------
-        A subclass can overwrite this method to create a ndarray-like object
-        other then the default Numpy array.
-        """
         ret = cls(cp.empty(shape=tuple(shape), dtype=dtype, order=order))
         if fill_value is not None:
             ret.fill(fill_value)
         return ret
 
-    @classmethod
-    def from_ndarray_like(cls, ndarray_like: NDArrayLike) -> Self:
-        """Create a new buffer of a ndarray-like object
-
-        Parameters
-        ----------
-        ndarray_like
-            ndarray-like object
-
-        Returns
-        -------
-            New buffer representing `ndarray_like`
-        """
-        return cls(ndarray_like)
-
     @classmethod
     def from_numpy_array(cls, array_like: npt.ArrayLike) -> Self:
         """Create a new buffer of Numpy array-like object
@@ -712,17 +625,6 @@ def from_numpy_array(cls, array_like: npt.ArrayLike) -> Self:
         """
         return cls(cp.asarray(array_like))
 
-    def as_ndarray_like(self) -> NDArrayLike:
-        """Returns the underlying array (host or device memory) of this buffer
-
-        This will never copy data.
-
-        Returns
-        -------
-            The underlying array such as a NumPy or CuPy array.
-        """
-        return self._data
-
     def as_numpy_array(self) -> npt.NDArray[Any]:
         """Returns the buffer as a NumPy array (host memory).
 
@@ -734,29 +636,8 @@ def as_numpy_array(self) -> npt.NDArray[Any]:
         -------
             NumPy array of this buffer (might be a data copy)
         """
-        from typing import cast
-
         return cast(npt.NDArray[Any], cp.asnumpy(self._data))
 
-    @property
-    def dtype(self) -> np.dtype[Any]:
-        return self._data.dtype
-
-    @property
-    def shape(self) -> tuple[int, ...]:
-        return self._data.shape
-
-    @property
-    def byteorder(self) -> Endian:
-        from zarr.codecs.bytes import Endian
-
-        if self.dtype.byteorder == "<":
-            return Endian.little
-        elif self.dtype.byteorder == ">":
-            return Endian.big
-        else:
-            return Endian(sys.byteorder)
-
     def __getitem__(self, key: Any) -> Self:
         return self.__class__(self._data.__getitem__(key))
 
@@ -768,9 +649,6 @@ def __setitem__(self, key: Any, value: Any) -> None:
             value = gpu_value._data
         self._data.__setitem__(key, value)
 
-    def transpose(self, axes: SupportsIndex | Sequence[SupportsIndex] | None) -> Self:
-        return self.__class__(self._data.transpose(axes))
-
 
 class BufferPrototype(NamedTuple):
     """Prototype of the Buffer and NDBuffer class

From 0eca79517ce8846a25f730c8fa77d0c81ce459bd Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Thu, 27 Jun 2024 23:13:05 -0700
Subject: [PATCH 09/46] Formatting

---
 src/zarr/buffer.py       | 18 ++++++++++++++++++
 src/zarr/store/memory.py | 17 ++++++++---------
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py
index 2c336d0079..487ab9faa7 100644
--- a/src/zarr/buffer.py
+++ b/src/zarr/buffer.py
@@ -151,6 +151,22 @@ def create_zero_length(cls) -> Self:
         """
         return cls(np.array([], dtype="b"))
 
+    @classmethod
+    def from_buffer(cls, buffer: Buffer) -> Self:
+        """Create a Buffer given another arbitrary Buffer
+
+        Returns
+        -------
+            New Buffer representing the same data as `buffer`
+
+
+        Note
+        ----
+        Subclassed of Buffer must override this method to implement
+        more optimal conversions that avoid copies where possible
+        """
+        return cls(buffer.as_numpy_array())
+
     @classmethod
     def from_array_like(cls, array_like: ArrayLike) -> Self:
         """Create a new buffer of an array-like object
@@ -525,6 +541,8 @@ def create_zero_length(cls) -> Self:
     @classmethod
     def from_buffer(cls, buffer: Buffer) -> Self:
         """Create an GpuBuffer given an arbitrary Buffer
+        This will try to be zero-copy if `buffer` is already on the
+        GPU and will trigger a copy if not.
 
         Returns
         -------
diff --git a/src/zarr/store/memory.py b/src/zarr/store/memory.py
index e1ebf9110c..52058750e9 100644
--- a/src/zarr/store/memory.py
+++ b/src/zarr/store/memory.py
@@ -104,13 +104,18 @@ async def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
 
 
 class GpuMemoryStore(MemoryStore):
+    """A GPU only memory store that stores every chunk in GPU memory irrespective
+    of the original location. This guarantees that chunks will always be in GPU
+    memory for downstream processing. For location agnostic use cases, it would
+    be better to use `MemoryStore` instead.
+    """
+
     _store_dict: MutableMapping[str, Buffer]
 
     def __init__(
         self, store_dict: MutableMapping[str, Buffer] | None = None, *, mode: OpenMode = "r"
     ):
         super().__init__(mode=mode)
-        self._store_dict = {}
         if store_dict:
             self._store_dict = {k: GpuBuffer.from_buffer(store_dict[k]) for k in iter(store_dict)}
 
@@ -130,7 +135,7 @@ async def get(
         try:
             value = self._store_dict[key]
             start, length = _normalize_interval_index(value, byte_range)
-            return value[start : start + length]
+            return prototype.buffer.from_buffer(value[start : start + length])
         except KeyError:
             return None
 
@@ -141,7 +146,6 @@ async def get_partial_values(
     ) -> list[Buffer | None]:
         # All the key-ranges arguments goes with the same prototype
         async def _get(key: str, byte_range: tuple[int, int | None]) -> Buffer | None:
-            # Q: use prototype here to convert to bespoke buffer class? If so, how?
             return await self.get(key, prototype=prototype, byte_range=byte_range)
 
         vals = await concurrent_map(key_ranges, _get, limit=None)
@@ -155,9 +159,4 @@ async def set(self, key: str, value: Buffer, byte_range: tuple[int, int] | None
 
         # Convert to GpuBuffer
         gpu_value = value if isinstance(value, GpuBuffer) else GpuBuffer.from_buffer(value)
-        if byte_range is not None:
-            buf = self._store_dict[key]
-            buf[byte_range[0] : byte_range[1]] = gpu_value
-            self._store_dict[key] = buf
-        else:
-            self._store_dict[key] = gpu_value
+        await super().set(key, gpu_value, byte_range=byte_range)

From d9ed6c4cc9cdc17fe7ac9f136f5be0d4d15c0733 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Thu, 27 Jun 2024 23:36:16 -0700
Subject: [PATCH 10/46] Fixing silent rebase conflicts

---
 src/zarr/buffer.py | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/src/zarr/buffer.py b/src/zarr/buffer.py
index 487ab9faa7..1668ac7c23 100644
--- a/src/zarr/buffer.py
+++ b/src/zarr/buffer.py
@@ -151,22 +151,6 @@ def create_zero_length(cls) -> Self:
         """
         return cls(np.array([], dtype="b"))
 
-    @classmethod
-    def from_buffer(cls, buffer: Buffer) -> Self:
-        """Create a Buffer given another arbitrary Buffer
-
-        Returns
-        -------
-            New Buffer representing the same data as `buffer`
-
-
-        Note
-        ----
-        Subclassed of Buffer must override this method to implement
-        more optimal conversions that avoid copies where possible
-        """
-        return cls(buffer.as_numpy_array())
-
     @classmethod
     def from_array_like(cls, array_like: ArrayLike) -> Self:
         """Create a new buffer of an array-like object
@@ -202,8 +186,13 @@ def from_buffer(cls, buffer: Buffer) -> Self:
         Returns
         -------
             A new buffer representing the content of the input buffer
+
+        Note
+        ----
+        Subclasses of `Buffer` must override this method to implement
+        more optimal conversions that avoid copies where possible
         """
-        return cls.from_array_like(buffer.as_array_like())
+        return cls.from_array_like(buffer.as_numpy_array())
 
     @classmethod
     def from_bytes(cls, bytes_like: BytesLike) -> Self:

From 5405e38dc1a79622c14e282a817b739bb918186b Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 28 Jun 2024 08:03:05 -0700
Subject: [PATCH 11/46] Reducing code duplication in GpuMemoryStore

---
 src/zarr/store/memory.py | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/src/zarr/store/memory.py b/src/zarr/store/memory.py
index 52058750e9..3e9bd2a513 100644
--- a/src/zarr/store/memory.py
+++ b/src/zarr/store/memory.py
@@ -125,32 +125,6 @@ def __str__(self) -> str:
     def __repr__(self) -> str:
         return f"GpuMemoryStore({str(self)!r})"
 
-    async def get(
-        self,
-        key: str,
-        prototype: BufferPrototype,
-        byte_range: tuple[int | None, int | None] | None = None,
-    ) -> Buffer | None:
-        assert isinstance(key, str)
-        try:
-            value = self._store_dict[key]
-            start, length = _normalize_interval_index(value, byte_range)
-            return prototype.buffer.from_buffer(value[start : start + length])
-        except KeyError:
-            return None
-
-    async def get_partial_values(
-        self,
-        prototype: BufferPrototype,
-        key_ranges: list[tuple[str, tuple[int | None, int | None]]],
-    ) -> list[Buffer | None]:
-        # All the key-ranges arguments goes with the same prototype
-        async def _get(key: str, byte_range: tuple[int, int | None]) -> Buffer | None:
-            return await self.get(key, prototype=prototype, byte_range=byte_range)
-
-        vals = await concurrent_map(key_ranges, _get, limit=None)
-        return vals
-
     async def set(self, key: str, value: Buffer, byte_range: tuple[int, int] | None = None) -> None:
         self._check_writable()
         assert isinstance(key, str)

From 28587010e76b293d081dee5c67fe94490601c59e Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Mon, 8 Jul 2024 12:00:36 -0700
Subject: [PATCH 12/46] Refactoring to an abstract Buffer class and concrete
 CPU and GPU implementations of those

---
 src/zarr/array.py                      |  28 ++-
 src/zarr/buffer/__init__.py            |  17 ++
 src/zarr/{buffer.py => buffer/core.py} | 271 +++----------------------
 src/zarr/buffer/cpu.py                 | 213 +++++++++++++++++++
 src/zarr/buffer/gpu.py                 | 212 +++++++++++++++++++
 src/zarr/codecs/blosc.py               |   3 +-
 src/zarr/codecs/gzip.py                |   3 +-
 src/zarr/codecs/zstd.py                |   3 +-
 src/zarr/store/memory.py               |   3 +-
 tests/v3/test_buffer.py                |  16 +-
 10 files changed, 504 insertions(+), 265 deletions(-)
 create mode 100644 src/zarr/buffer/__init__.py
 rename src/zarr/{buffer.py => buffer/core.py} (58%)
 create mode 100644 src/zarr/buffer/cpu.py
 create mode 100644 src/zarr/buffer/gpu.py

diff --git a/src/zarr/array.py b/src/zarr/array.py
index 26a19e64ab..313bc4a575 100644
--- a/src/zarr/array.py
+++ b/src/zarr/array.py
@@ -22,7 +22,11 @@
 from zarr.attributes import Attributes
 from zarr.buffer import BufferPrototype, NDArrayLike, NDBuffer, default_buffer_prototype
 from zarr.chunk_grids import RegularChunkGrid
-from zarr.chunk_key_encodings import ChunkKeyEncoding, DefaultChunkKeyEncoding, V2ChunkKeyEncoding
+from zarr.chunk_key_encodings import (
+    ChunkKeyEncoding,
+    DefaultChunkKeyEncoding,
+    V2ChunkKeyEncoding,
+)
 from zarr.codecs import BytesCodec
 from zarr.codecs._v2 import V2Compressor, V2Filters
 from zarr.codecs.pipeline import BatchedCodecPipeline
@@ -76,7 +80,9 @@ def parse_array_metadata(data: Any) -> ArrayV2Metadata | ArrayV3Metadata:
     raise TypeError
 
 
-def create_codec_pipeline(metadata: ArrayV2Metadata | ArrayV3Metadata) -> BatchedCodecPipeline:
+def create_codec_pipeline(
+    metadata: ArrayV2Metadata | ArrayV3Metadata,
+) -> BatchedCodecPipeline:
     if isinstance(metadata, ArrayV3Metadata):
         return BatchedCodecPipeline.from_list(metadata.codecs)
     elif isinstance(metadata, ArrayV2Metadata):
@@ -474,7 +480,10 @@ async def _get_selection(
         return out_buffer.as_ndarray_like()
 
     async def getitem(
-        self, selection: BasicSelection, *, prototype: BufferPrototype = default_buffer_prototype
+        self,
+        selection: BasicSelection,
+        *,
+        prototype: BufferPrototype = default_buffer_prototype,
     ) -> NDArrayLike:
         indexer = BasicIndexer(
             selection,
@@ -502,7 +511,12 @@ async def _set_selection(
 
         # check value shape
         if np.isscalar(value):
-            value = np.asanyarray(value, dtype=self.metadata.dtype)
+            array_like = prototype.buffer.create_zero_length().as_array_like()
+            if isinstance(array_like, np._typing._SupportsArrayFunc):
+                # TODO: need to handle array types that don't support __array_function__
+                # like PyTorch and JAX
+                array_like_ = cast(np._typing._SupportsArrayFunc, array_like)
+            value = np.asanyarray(value, dtype=self.metadata.dtype, like=array_like_)
         else:
             if not hasattr(value, "shape"):
                 value = np.asarray(value, self.metadata.dtype)
@@ -510,7 +524,11 @@ async def _set_selection(
             #     value.shape == indexer.shape
             # ), f"shape of value doesn't match indexer shape. Expected {indexer.shape}, got {value.shape}"
             if not hasattr(value, "dtype") or value.dtype.name != self.metadata.dtype.name:
-                value = np.array(value, dtype=self.metadata.dtype, order="A")
+                if hasattr(value, "astype"):
+                    # Handle things that are already NDArrayLike more efficiently
+                    value = value.astype(dtype=self.metadata.dtype, order="A")
+                else:
+                    value = np.array(value, dtype=self.metadata.dtype, order="A")
         value = cast(NDArrayLike, value)
         # We accept any ndarray like object from the user and convert it
         # to a NDBuffer (or subclass). From this point onwards, we only pass
diff --git a/src/zarr/buffer/__init__.py b/src/zarr/buffer/__init__.py
new file mode 100644
index 0000000000..d4de3b2786
--- /dev/null
+++ b/src/zarr/buffer/__init__.py
@@ -0,0 +1,17 @@
+from zarr.buffer.core import (
+    ArrayLike,
+    Buffer,
+    BufferPrototype,
+    NDArrayLike,
+    NDBuffer,
+)
+from zarr.buffer.cpu import default_buffer_prototype
+
+__all__ = [
+    "ArrayLike",
+    "Buffer",
+    "NDArrayLike",
+    "NDBuffer",
+    "BufferPrototype",
+    "default_buffer_prototype",
+]
diff --git a/src/zarr/buffer.py b/src/zarr/buffer/core.py
similarity index 58%
rename from src/zarr/buffer.py
rename to src/zarr/buffer/core.py
index 1668ac7c23..755a3710d0 100644
--- a/src/zarr/buffer.py
+++ b/src/zarr/buffer/core.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 
 import sys
-import warnings
-from collections.abc import Callable, Iterable, Sequence
+from abc import ABC, abstractmethod
+from collections.abc import Iterable, Sequence
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -10,7 +10,6 @@
     NamedTuple,
     Protocol,
     SupportsIndex,
-    cast,
     runtime_checkable,
 )
 
@@ -25,11 +24,6 @@
     from zarr.codecs.bytes import Endian
     from zarr.common import BytesLike
 
-try:
-    import cupy as cp
-except ImportError:
-    cp = None
-
 
 @runtime_checkable
 class ArrayLike(Protocol):
@@ -113,7 +107,7 @@ def check_item_key_is_1d_contiguous(key: Any) -> None:
         raise ValueError("slice must be contiguous")
 
 
-class Buffer:
+class Buffer(ABC):
     """A flat contiguous memory block
 
     We use Buffer throughout Zarr to represent a contiguous block of memory.
@@ -142,6 +136,7 @@ def __init__(self, array_like: ArrayLike):
         self._data = array_like
 
     @classmethod
+    @abstractmethod
     def create_zero_length(cls) -> Self:
         """Create an empty buffer with length zero
 
@@ -149,7 +144,7 @@ def create_zero_length(cls) -> Self:
         -------
             New empty 0-length buffer
         """
-        return cls(np.array([], dtype="b"))
+        ...
 
     @classmethod
     def from_array_like(cls, array_like: ArrayLike) -> Self:
@@ -167,6 +162,7 @@ def from_array_like(cls, array_like: ArrayLike) -> Self:
         return cls(array_like)
 
     @classmethod
+    @abstractmethod
     def from_buffer(cls, buffer: Buffer) -> Self:
         """Create a new buffer of an existing Buffer
 
@@ -192,9 +188,10 @@ def from_buffer(cls, buffer: Buffer) -> Self:
         Subclasses of `Buffer` must override this method to implement
         more optimal conversions that avoid copies where possible
         """
-        return cls.from_array_like(buffer.as_numpy_array())
+        ...
 
     @classmethod
+    @abstractmethod
     def from_bytes(cls, bytes_like: BytesLike) -> Self:
         """Create a new buffer of a bytes-like object (host memory)
 
@@ -207,7 +204,7 @@ def from_bytes(cls, bytes_like: BytesLike) -> Self:
         -------
             New buffer representing `bytes_like`
         """
-        return cls.from_array_like(np.frombuffer(bytes_like, dtype="b"))
+        ...
 
     def as_array_like(self) -> ArrayLike:
         """Returns the underlying array (host or device memory) of this buffer
@@ -220,6 +217,7 @@ def as_array_like(self) -> ArrayLike:
         """
         return self._data
 
+    @abstractmethod
     def as_numpy_array(self) -> npt.NDArray[Any]:
         """Returns the buffer as a NumPy array (host memory).
 
@@ -231,7 +229,7 @@ def as_numpy_array(self) -> npt.NDArray[Any]:
         -------
             NumPy array of this buffer (might be a data copy)
         """
-        return np.asanyarray(self._data)
+        ...
 
     def to_bytes(self) -> bytes:
         """Returns the buffer as `bytes` (host memory).
@@ -258,14 +256,10 @@ def __setitem__(self, key: slice, value: Any) -> None:
     def __len__(self) -> int:
         return self._data.size
 
+    @abstractmethod
     def __add__(self, other: Buffer) -> Self:
         """Concatenate two buffers"""
-
-        other_array = other.as_array_like()
-        assert other_array.dtype == np.dtype("b")
-        return self.__class__(
-            np.concatenate((np.asanyarray(self._data), np.asanyarray(other_array)))
-        )
+        ...
 
 
 class NDBuffer:
@@ -299,6 +293,7 @@ def __init__(self, array: NDArrayLike):
         self._data = array
 
     @classmethod
+    @abstractmethod
     def create(
         cls,
         *,
@@ -330,10 +325,7 @@ def create(
         A subclass can overwrite this method to create a ndarray-like object
         other then the default Numpy array.
         """
-        ret = cls(np.empty(shape=tuple(shape), dtype=dtype, order=order))
-        if fill_value is not None:
-            ret.fill(fill_value)
-        return ret
+        ...
 
     @classmethod
     def from_ndarray_like(cls, ndarray_like: NDArrayLike) -> Self:
@@ -351,6 +343,7 @@ def from_ndarray_like(cls, ndarray_like: NDArrayLike) -> Self:
         return cls(ndarray_like)
 
     @classmethod
+    @abstractmethod
     def from_numpy_array(cls, array_like: npt.ArrayLike) -> Self:
         """Create a new buffer of Numpy array-like object
 
@@ -363,7 +356,7 @@ def from_numpy_array(cls, array_like: npt.ArrayLike) -> Self:
         -------
             New buffer representing `array_like`
         """
-        return cls.from_ndarray_like(np.asanyarray(array_like))
+        ...
 
     def as_ndarray_like(self) -> NDArrayLike:
         """Returns the underlying array (host or device memory) of this buffer
@@ -376,6 +369,7 @@ def as_ndarray_like(self) -> NDArrayLike:
         """
         return self._data
 
+    @abstractmethod
     def as_numpy_array(self) -> npt.NDArray[Any]:
         """Returns the buffer as a NumPy array (host memory).
 
@@ -387,7 +381,7 @@ def as_numpy_array(self) -> npt.NDArray[Any]:
         -------
             NumPy array of this buffer (might be a data copy)
         """
-        return np.asanyarray(self._data)
+        ...
 
     @property
     def dtype(self) -> np.dtype[Any]:
@@ -418,13 +412,11 @@ def squeeze(self, axis: tuple[int, ...]) -> Self:
     def astype(self, dtype: npt.DTypeLike, order: Literal["K", "A", "C", "F"] = "K") -> Self:
         return self.__class__(self._data.astype(dtype=dtype, order=order))
 
-    def __getitem__(self, key: Any) -> Self:
-        return self.__class__(np.asanyarray(self._data.__getitem__(key)))
+    @abstractmethod
+    def __getitem__(self, key: Any) -> Self: ...
 
-    def __setitem__(self, key: Any, value: Any) -> None:
-        if isinstance(value, NDBuffer):
-            value = value._data
-        self._data.__setitem__(key, value)
+    @abstractmethod
+    def __setitem__(self, key: Any, value: Any) -> None: ...
 
     def __len__(self) -> int:
         return self._data.__len__()
@@ -445,218 +437,6 @@ def transpose(self, axes: SupportsIndex | Sequence[SupportsIndex] | None) -> Sel
         return self.__class__(self._data.transpose(axes))
 
 
-def as_numpy_array_wrapper(
-    func: Callable[[npt.NDArray[Any]], bytes], buf: Buffer, prototype: BufferPrototype
-) -> Buffer:
-    """Converts the input of `func` to a numpy array and the output back to `Buffer`.
-
-    This function is useful when calling a `func` that only support host memory such
-    as `GZip.decode` and `Blosc.decode`. In this case, use this wrapper to convert
-    the input `buf` to a Numpy array and convert the result back into a `Buffer`.
-
-    Parameters
-    ----------
-    func
-        The callable that will be called with the converted `buf` as input.
-        `func` must return bytes, which will be converted into a `Buffer`
-        before returned.
-    buf
-        The buffer that will be converted to a Numpy array before given as
-        input to `func`.
-    prototype
-        The prototype of the output buffer.
-
-    Returns
-    -------
-        The result of `func` converted to a `Buffer`
-    """
-    return prototype.buffer.from_bytes(func(buf.as_numpy_array()))
-
-
-class GpuBuffer(Buffer):
-    """A flat contiguous memory block on the GPU
-
-    We use Buffer throughout Zarr to represent a contiguous block of memory.
-
-    A Buffer is backed by a underlying array-like instance that represents
-    the memory. The memory type is unspecified; can be regular host memory,
-    CUDA device memory, or something else. The only requirement is that the
-    array-like instance can be copied/converted to a regular Numpy array
-    (host memory).
-
-    Note
-    ----
-    This buffer is untyped, so all indexing and sizes are in bytes.
-
-    Parameters
-    ----------
-    array_like
-        array-like object that must be 1-dim, contiguous, and byte dtype.
-    """
-
-    def __init__(self, array_like: ArrayLike):
-        if cp is None:
-            raise ImportError("Cannot use GpuBuffer without cupy. Please install cupy.")
-
-        if array_like.ndim != 1:
-            raise ValueError("array_like: only 1-dim allowed")
-        if array_like.dtype != np.dtype("b"):
-            raise ValueError("array_like: only byte dtype allowed")
-
-        if not hasattr(array_like, "__cuda_array_interface__"):
-            # Slow copy based path for arrays that don't support the __cuda_array_interface__
-            # TODO: Add a fast zero-copy path for arrays that support the dlpack protocol
-            msg = (
-                "Creating a GpuBuffer with an array that does not support the "
-                "__cuda_array_interface__ for zero-copy transfers, "
-                "falling back to slow copy based path"
-            )
-            warnings.warn(
-                msg,
-                stacklevel=2,
-            )
-        self._data = cp.asarray(array_like)
-
-    @classmethod
-    def create_zero_length(cls) -> Self:
-        """Create an empty buffer with length zero
-
-        Returns
-        -------
-            New empty 0-length buffer
-        """
-        return cls(cp.array([], dtype="b"))
-
-    @classmethod
-    def from_buffer(cls, buffer: Buffer) -> Self:
-        """Create an GpuBuffer given an arbitrary Buffer
-        This will try to be zero-copy if `buffer` is already on the
-        GPU and will trigger a copy if not.
-
-        Returns
-        -------
-            New GpuBuffer constructed from `buffer`
-        """
-        return cls(buffer.as_array_like())
-
-    @classmethod
-    def from_bytes(cls, bytes_like: BytesLike) -> Self:
-        return cls.from_array_like(cp.frombuffer(bytes_like, dtype="b"))
-
-    def as_numpy_array(self) -> npt.NDArray[Any]:
-        return cast(npt.NDArray[Any], cp.asnumpy(self._data))
-
-    def __add__(self, other: Buffer) -> Self:
-        other_array = other.as_array_like()
-        assert other_array.dtype == np.dtype("b")
-        gpu_other = GpuBuffer(other_array)
-        gpu_other_array = gpu_other.as_array_like()
-        return self.__class__(
-            cp.concatenate((cp.asanyarray(self._data), cp.asanyarray(gpu_other_array)))
-        )
-
-
-class GpuNDBuffer(NDBuffer):
-    """A n-dimensional memory block on the GPU
-
-    We use NDBuffer throughout Zarr to represent a n-dimensional memory block.
-
-    A NDBuffer is backed by a underlying ndarray-like instance that represents
-    the memory. The memory type is unspecified; can be regular host memory,
-    CUDA device memory, or something else. The only requirement is that the
-    ndarray-like instance can be copied/converted to a regular Numpy array
-    (host memory).
-
-    Note
-    ----
-    The two buffer classes Buffer and NDBuffer are very similar. In fact, Buffer
-    is a special case of NDBuffer where dim=1, stride=1, and dtype="b". However,
-    in order to use Python's type system to differentiate between the contiguous
-    Buffer and the n-dim (non-contiguous) NDBuffer, we keep the definition of the
-    two classes separate.
-
-    Parameters
-    ----------
-    ndarray_like
-        ndarray-like object that is convertible to a regular Numpy array.
-    """
-
-    def __init__(self, array: NDArrayLike):
-        if cp is None:
-            raise ImportError("Cannot use GpuNDBuffer without cupy. Please install cupy.")
-
-        # assert array.ndim > 0
-        assert array.dtype != object
-        self._data = array
-
-        if not hasattr(array, "__cuda_array_interface__"):
-            # Slow copy based path for arrays that don't support the __cuda_array_interface__
-            # TODO: Add a fast zero-copy path for arrays that support the dlpack protocol
-            msg = (
-                "Creating a GpuNDBuffer with an array that does not support the "
-                "__cuda_array_interface__ for zero-copy transfers, "
-                "falling back to slow copy based path"
-            )
-            warnings.warn(
-                msg,
-                stacklevel=2,
-            )
-        self._data = cp.asarray(array)
-
-    @classmethod
-    def create(
-        cls,
-        *,
-        shape: Iterable[int],
-        dtype: npt.DTypeLike,
-        order: Literal["C", "F"] = "C",
-        fill_value: Any | None = None,
-    ) -> Self:
-        ret = cls(cp.empty(shape=tuple(shape), dtype=dtype, order=order))
-        if fill_value is not None:
-            ret.fill(fill_value)
-        return ret
-
-    @classmethod
-    def from_numpy_array(cls, array_like: npt.ArrayLike) -> Self:
-        """Create a new buffer of Numpy array-like object
-
-        Parameters
-        ----------
-        array_like
-            Object that can be coerced into a Numpy array
-
-        Returns
-        -------
-            New buffer representing `array_like`
-        """
-        return cls(cp.asarray(array_like))
-
-    def as_numpy_array(self) -> npt.NDArray[Any]:
-        """Returns the buffer as a NumPy array (host memory).
-
-        Warning
-        -------
-        Might have to copy data, consider using `.as_ndarray_like()` instead.
-
-        Returns
-        -------
-            NumPy array of this buffer (might be a data copy)
-        """
-        return cast(npt.NDArray[Any], cp.asnumpy(self._data))
-
-    def __getitem__(self, key: Any) -> Self:
-        return self.__class__(self._data.__getitem__(key))
-
-    def __setitem__(self, key: Any, value: Any) -> None:
-        if isinstance(value, GpuNDBuffer):
-            value = value._data
-        elif isinstance(value, NDBuffer):
-            gpu_value = GpuNDBuffer(value.as_ndarray_like())
-            value = gpu_value._data
-        self._data.__setitem__(key, value)
-
-
 class BufferPrototype(NamedTuple):
     """Prototype of the Buffer and NDBuffer class
 
@@ -672,8 +452,3 @@ class BufferPrototype(NamedTuple):
 
     buffer: type[Buffer]
     nd_buffer: type[NDBuffer]
-
-
-# The default buffer prototype used throughout the Zarr codebase.
-default_buffer_prototype = BufferPrototype(buffer=Buffer, nd_buffer=NDBuffer)
-gpu_buffer_prototype = BufferPrototype(buffer=GpuBuffer, nd_buffer=GpuNDBuffer)
diff --git a/src/zarr/buffer/cpu.py b/src/zarr/buffer/cpu.py
new file mode 100644
index 0000000000..d791d8e696
--- /dev/null
+++ b/src/zarr/buffer/cpu.py
@@ -0,0 +1,213 @@
+from __future__ import annotations
+
+from collections.abc import Callable, Iterable
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+)
+
+import numpy as np
+import numpy.typing as npt
+
+from zarr.buffer import ArrayLike, NDArrayLike, core
+
+if TYPE_CHECKING:
+    from typing_extensions import Self
+
+    from zarr.common import BytesLike
+
+
+class Buffer(core.Buffer):
+    """A flat contiguous memory block
+
+    We use Buffer throughout Zarr to represent a contiguous block of memory.
+
+    A Buffer is backed by a underlying array-like instance that represents
+    the memory. The memory type is unspecified; can be regular host memory,
+    CUDA device memory, or something else. The only requirement is that the
+    array-like instance can be copied/converted to a regular Numpy array
+    (host memory).
+
+    Notes
+    -----
+    This buffer is untyped, so all indexing and sizes are in bytes.
+
+    Parameters
+    ----------
+    array_like
+        array-like object that must be 1-dim, contiguous, and byte dtype.
+    """
+
+    def __init__(self, array_like: ArrayLike):
+        super().__init__(array_like)
+
+    @classmethod
+    def create_zero_length(cls) -> Self:
+        return cls(np.array([], dtype="b"))
+
+    @classmethod
+    def from_buffer(cls, buffer: core.Buffer) -> Self:
+        """Create a new buffer of an existing Buffer
+
+        This is useful if you want to ensure that an existing buffer is
+        of the correct subclass of Buffer. E.g., MemoryStore uses this
+        to return a buffer instance of the subclass specified by its
+        BufferPrototype argument.
+
+        Typically, this only copies data if the data has to be moved between
+        memory types, such as from host to device memory.
+
+        Parameters
+        ----------
+        buffer
+            buffer object.
+
+        Returns
+        -------
+            A new buffer representing the content of the input buffer
+
+        Note
+        ----
+        Subclasses of `Buffer` must override this method to implement
+        more optimal conversions that avoid copies where possible
+        """
+        return cls.from_array_like(buffer.as_numpy_array())
+
+    @classmethod
+    def from_bytes(cls, bytes_like: BytesLike) -> Self:
+        """Create a new buffer of a bytes-like object (host memory)
+
+        Parameters
+        ----------
+        bytes_like
+           bytes-like object
+
+        Returns
+        -------
+            New buffer representing `bytes_like`
+        """
+        return cls.from_array_like(np.frombuffer(bytes_like, dtype="b"))
+
+    def as_numpy_array(self) -> npt.NDArray[Any]:
+        """Returns the buffer as a NumPy array (host memory).
+
+        Notes
+        -----
+        Might have to copy data, consider using `.as_array_like()` instead.
+
+        Returns
+        -------
+            NumPy array of this buffer (might be a data copy)
+        """
+        return np.asanyarray(self._data)
+
+    def __add__(self, other: core.Buffer) -> Self:
+        """Concatenate two buffers"""
+
+        other_array = other.as_array_like()
+        assert other_array.dtype == np.dtype("b")
+        return self.__class__(
+            np.concatenate((np.asanyarray(self._data), np.asanyarray(other_array)))
+        )
+
+
+class NDBuffer(core.NDBuffer):
+    """An n-dimensional memory block
+
+    We use NDBuffer throughout Zarr to represent a n-dimensional memory block.
+
+    A NDBuffer is backed by a underlying ndarray-like instance that represents
+    the memory. The memory type is unspecified; can be regular host memory,
+    CUDA device memory, or something else. The only requirement is that the
+    ndarray-like instance can be copied/converted to a regular Numpy array
+    (host memory).
+
+    Notes
+    -----
+    The two buffer classes Buffer and NDBuffer are very similar. In fact, Buffer
+    is a special case of NDBuffer where dim=1, stride=1, and dtype="b". However,
+    in order to use Python's type system to differentiate between the contiguous
+    Buffer and the n-dim (non-contiguous) NDBuffer, we keep the definition of the
+    two classes separate.
+
+    Parameters
+    ----------
+    ndarray_like
+        ndarray-like object that is convertible to a regular Numpy array.
+    """
+
+    def __init__(self, array: NDArrayLike):
+        super().__init__(array)
+
+    @classmethod
+    def create(
+        cls,
+        *,
+        shape: Iterable[int],
+        dtype: npt.DTypeLike,
+        order: Literal["C", "F"] = "C",
+        fill_value: Any | None = None,
+    ) -> Self:
+        ret = cls(np.empty(shape=tuple(shape), dtype=dtype, order=order))
+        if fill_value is not None:
+            ret.fill(fill_value)
+        return ret
+
+    @classmethod
+    def from_numpy_array(cls, array_like: npt.ArrayLike) -> Self:
+        return cls.from_ndarray_like(np.asanyarray(array_like))
+
+    def as_numpy_array(self) -> npt.NDArray[Any]:
+        """Returns the buffer as a NumPy array (host memory).
+
+        Warnings
+        --------
+        Might have to copy data, consider using `.as_ndarray_like()` instead.
+
+        Returns
+        -------
+            NumPy array of this buffer (might be a data copy)
+        """
+        return np.asanyarray(self._data)
+
+    def __getitem__(self, key: Any) -> Self:
+        return self.__class__(np.asanyarray(self._data.__getitem__(key)))
+
+    def __setitem__(self, key: Any, value: Any) -> None:
+        if isinstance(value, NDBuffer):
+            value = value._data
+        self._data.__setitem__(key, value)
+
+
+def as_numpy_array_wrapper(
+    func: Callable[[npt.NDArray[Any]], bytes], buf: core.Buffer, prototype: core.BufferPrototype
+) -> core.Buffer:
+    """Converts the input of `func` to a numpy array and the output back to `Buffer`.
+
+    This function is useful when calling a `func` that only support host memory such
+    as `GZip.decode` and `Blosc.decode`. In this case, use this wrapper to convert
+    the input `buf` to a Numpy array and convert the result back into a `Buffer`.
+
+    Parameters
+    ----------
+    func
+        The callable that will be called with the converted `buf` as input.
+        `func` must return bytes, which will be converted into a `Buffer`
+        before returned.
+    buf
+        The buffer that will be converted to a Numpy array before given as
+        input to `func`.
+    prototype
+        The prototype of the output buffer.
+
+    Returns
+    -------
+        The result of `func` converted to a `Buffer`
+    """
+    return prototype.buffer.from_bytes(func(buf.as_numpy_array()))
+
+
+# CPU buffer prototype using numpy arrays
+buffer_prototype = core.BufferPrototype(buffer=Buffer, nd_buffer=NDBuffer)
+default_buffer_prototype = buffer_prototype
diff --git a/src/zarr/buffer/gpu.py b/src/zarr/buffer/gpu.py
new file mode 100644
index 0000000000..d4f626ed69
--- /dev/null
+++ b/src/zarr/buffer/gpu.py
@@ -0,0 +1,212 @@
+from __future__ import annotations
+
+import warnings
+from collections.abc import Iterable
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    cast,
+)
+
+import numpy as np
+import numpy.typing as npt
+
+from zarr.buffer import ArrayLike, Buffer, BufferPrototype, NDArrayLike, NDBuffer
+
+if TYPE_CHECKING:
+    from typing_extensions import Self
+
+    from zarr.common import BytesLike
+
+try:
+    import cupy as cp
+except ImportError:
+    cp = None
+
+
+class GpuBuffer(Buffer):
+    """A flat contiguous memory block on the GPU
+
+    We use Buffer throughout Zarr to represent a contiguous block of memory.
+
+    A Buffer is backed by a underlying array-like instance that represents
+    the memory. The memory type is unspecified; can be regular host memory,
+    CUDA device memory, or something else. The only requirement is that the
+    array-like instance can be copied/converted to a regular Numpy array
+    (host memory).
+
+    Note
+    ----
+    This buffer is untyped, so all indexing and sizes are in bytes.
+
+    Parameters
+    ----------
+    array_like
+        array-like object that must be 1-dim, contiguous, and byte dtype.
+    """
+
+    def __init__(self, array_like: ArrayLike):
+        if cp is None:
+            raise ImportError("Cannot use GpuBuffer without cupy. Please install cupy.")
+
+        if array_like.ndim != 1:
+            raise ValueError("array_like: only 1-dim allowed")
+        if array_like.dtype != np.dtype("b"):
+            raise ValueError("array_like: only byte dtype allowed")
+
+        if not hasattr(array_like, "__cuda_array_interface__"):
+            # Slow copy based path for arrays that don't support the __cuda_array_interface__
+            # TODO: Add a fast zero-copy path for arrays that support the dlpack protocol
+            msg = (
+                "Creating a GpuBuffer with an array that does not support the "
+                "__cuda_array_interface__ for zero-copy transfers, "
+                "falling back to slow copy based path"
+            )
+            warnings.warn(
+                msg,
+                stacklevel=2,
+            )
+        self._data = cp.asarray(array_like)
+
+    @classmethod
+    def create_zero_length(cls) -> Self:
+        """Create an empty buffer with length zero
+
+        Returns
+        -------
+            New empty 0-length buffer
+        """
+        return cls(cp.array([], dtype="b"))
+
+    @classmethod
+    def from_buffer(cls, buffer: Buffer) -> Self:
+        """Create an GpuBuffer given an arbitrary Buffer
+        This will try to be zero-copy if `buffer` is already on the
+        GPU and will trigger a copy if not.
+
+        Returns
+        -------
+            New GpuBuffer constructed from `buffer`
+        """
+        return cls(buffer.as_array_like())
+
+    @classmethod
+    def from_bytes(cls, bytes_like: BytesLike) -> Self:
+        return cls.from_array_like(cp.frombuffer(bytes_like, dtype="b"))
+
+    def as_numpy_array(self) -> npt.NDArray[Any]:
+        return cast(npt.NDArray[Any], cp.asnumpy(self._data))
+
+    def __add__(self, other: Buffer) -> Self:
+        other_array = other.as_array_like()
+        assert other_array.dtype == np.dtype("b")
+        gpu_other = GpuBuffer(other_array)
+        gpu_other_array = gpu_other.as_array_like()
+        return self.__class__(
+            cp.concatenate((cp.asanyarray(self._data), cp.asanyarray(gpu_other_array)))
+        )
+
+
+class GpuNDBuffer(NDBuffer):
+    """A n-dimensional memory block on the GPU
+
+    We use NDBuffer throughout Zarr to represent a n-dimensional memory block.
+
+    A NDBuffer is backed by a underlying ndarray-like instance that represents
+    the memory. The memory type is unspecified; can be regular host memory,
+    CUDA device memory, or something else. The only requirement is that the
+    ndarray-like instance can be copied/converted to a regular Numpy array
+    (host memory).
+
+    Note
+    ----
+    The two buffer classes Buffer and NDBuffer are very similar. In fact, Buffer
+    is a special case of NDBuffer where dim=1, stride=1, and dtype="b". However,
+    in order to use Python's type system to differentiate between the contiguous
+    Buffer and the n-dim (non-contiguous) NDBuffer, we keep the definition of the
+    two classes separate.
+
+    Parameters
+    ----------
+    ndarray_like
+        ndarray-like object that is convertible to a regular Numpy array.
+    """
+
+    def __init__(self, array: NDArrayLike):
+        if cp is None:
+            raise ImportError("Cannot use GpuNDBuffer without cupy. Please install cupy.")
+
+        # assert array.ndim > 0
+        assert array.dtype != object
+        self._data = array
+
+        if not hasattr(array, "__cuda_array_interface__"):
+            # Slow copy based path for arrays that don't support the __cuda_array_interface__
+            # TODO: Add a fast zero-copy path for arrays that support the dlpack protocol
+            msg = (
+                "Creating a GpuNDBuffer with an array that does not support the "
+                "__cuda_array_interface__ for zero-copy transfers, "
+                "falling back to slow copy based path"
+            )
+            warnings.warn(
+                msg,
+                stacklevel=2,
+            )
+        self._data = cp.asarray(array)
+
+    @classmethod
+    def create(
+        cls,
+        *,
+        shape: Iterable[int],
+        dtype: npt.DTypeLike,
+        order: Literal["C", "F"] = "C",
+        fill_value: Any | None = None,
+    ) -> Self:
+        ret = cls(cp.empty(shape=tuple(shape), dtype=dtype, order=order))
+        if fill_value is not None:
+            ret.fill(fill_value)
+        return ret
+
+    @classmethod
+    def from_numpy_array(cls, array_like: npt.ArrayLike) -> Self:
+        """Create a new buffer of Numpy array-like object
+
+        Parameters
+        ----------
+        array_like
+            Object that can be coerced into a Numpy array
+
+        Returns
+        -------
+            New buffer representing `array_like`
+        """
+        return cls(cp.asarray(array_like))
+
+    def as_numpy_array(self) -> npt.NDArray[Any]:
+        """Returns the buffer as a NumPy array (host memory).
+
+        Warning
+        -------
+        Might have to copy data, consider using `.as_ndarray_like()` instead.
+
+        Returns
+        -------
+            NumPy array of this buffer (might be a data copy)
+        """
+        return cast(npt.NDArray[Any], cp.asnumpy(self._data))
+
+    def __getitem__(self, key: Any) -> Self:
+        return self.__class__(self._data.__getitem__(key))
+
+    def __setitem__(self, key: Any, value: Any) -> None:
+        if isinstance(value, GpuNDBuffer):
+            value = value._data
+        elif isinstance(value, NDBuffer):
+            gpu_value = GpuNDBuffer(value.as_ndarray_like())
+            value = gpu_value._data
+        self._data.__setitem__(key, value)
+
+
+buffer_prototype = BufferPrototype(buffer=GpuBuffer, nd_buffer=GpuNDBuffer)
diff --git a/src/zarr/codecs/blosc.py b/src/zarr/codecs/blosc.py
index df1976d4c1..f6565bb9a0 100644
--- a/src/zarr/codecs/blosc.py
+++ b/src/zarr/codecs/blosc.py
@@ -10,7 +10,8 @@
 
 from zarr.abc.codec import BytesBytesCodec
 from zarr.array_spec import ArraySpec
-from zarr.buffer import Buffer, as_numpy_array_wrapper
+from zarr.buffer import Buffer
+from zarr.buffer.cpu import as_numpy_array_wrapper
 from zarr.codecs.registry import register_codec
 from zarr.common import JSON, parse_enum, parse_named_configuration, to_thread
 
diff --git a/src/zarr/codecs/gzip.py b/src/zarr/codecs/gzip.py
index 0ad97c1207..a9ac6f6f30 100644
--- a/src/zarr/codecs/gzip.py
+++ b/src/zarr/codecs/gzip.py
@@ -7,7 +7,8 @@
 
 from zarr.abc.codec import BytesBytesCodec
 from zarr.array_spec import ArraySpec
-from zarr.buffer import Buffer, as_numpy_array_wrapper
+from zarr.buffer import Buffer
+from zarr.buffer.cpu import as_numpy_array_wrapper
 from zarr.codecs.registry import register_codec
 from zarr.common import JSON, parse_named_configuration, to_thread
 
diff --git a/src/zarr/codecs/zstd.py b/src/zarr/codecs/zstd.py
index 4c5afba00b..14b36fd8cc 100644
--- a/src/zarr/codecs/zstd.py
+++ b/src/zarr/codecs/zstd.py
@@ -8,7 +8,8 @@
 
 from zarr.abc.codec import BytesBytesCodec
 from zarr.array_spec import ArraySpec
-from zarr.buffer import Buffer, as_numpy_array_wrapper
+from zarr.buffer import Buffer
+from zarr.buffer.cpu import as_numpy_array_wrapper
 from zarr.codecs.registry import register_codec
 from zarr.common import JSON, parse_named_configuration, to_thread
 
diff --git a/src/zarr/store/memory.py b/src/zarr/store/memory.py
index 3e9bd2a513..1cd006098d 100644
--- a/src/zarr/store/memory.py
+++ b/src/zarr/store/memory.py
@@ -3,7 +3,8 @@
 from collections.abc import AsyncGenerator, MutableMapping
 
 from zarr.abc.store import Store
-from zarr.buffer import Buffer, BufferPrototype, GpuBuffer
+from zarr.buffer import Buffer, BufferPrototype
+from zarr.buffer.gpu import GpuBuffer
 from zarr.common import OpenMode, concurrent_map
 from zarr.store.utils import _normalize_interval_index
 
diff --git a/tests/v3/test_buffer.py b/tests/v3/test_buffer.py
index 08118799b0..38e724bc51 100644
--- a/tests/v3/test_buffer.py
+++ b/tests/v3/test_buffer.py
@@ -13,8 +13,8 @@
     Buffer,
     BufferPrototype,
     NDArrayLike,
-    NDBuffer,
-    gpu_buffer_prototype,
+    cpu,
+    gpu,
 )
 from zarr.codecs.blosc import BloscCodec
 from zarr.codecs.bytes import BytesCodec
@@ -38,11 +38,11 @@ class MyNDArrayLike(np.ndarray):
     """An example of a ndarray-like class"""
 
 
-class MyBuffer(Buffer):
+class MyBuffer(cpu.Buffer):
     """Example of a custom Buffer that handles ArrayLike"""
 
 
-class MyNDBuffer(NDBuffer):
+class MyNDBuffer(cpu.NDBuffer):
     """Example of a custom NDBuffer that handles MyNDArrayLike"""
 
     @classmethod
@@ -136,9 +136,9 @@ async def test_async_array_gpu_prototype():
     await a.setitem(
         selection=(slice(1, 4), slice(3, 6)),
         value=cp.ones((3, 3)),
-        prototype=gpu_buffer_prototype,
+        prototype=gpu.buffer_prototype,
     )
-    got = await a.getitem(selection=(slice(0, 9), slice(0, 9)), prototype=gpu_buffer_prototype)
+    got = await a.getitem(selection=(slice(0, 9), slice(0, 9)), prototype=gpu.buffer_prototype)
     assert isinstance(got, cp.ndarray)
     assert cp.array_equal(expect, got)
 
@@ -199,8 +199,8 @@ async def test_codecs_use_of_gpu_prototype():
     await a.setitem(
         selection=(slice(0, 10), slice(0, 10)),
         value=expect[:],
-        prototype=gpu_buffer_prototype,
+        prototype=gpu.buffer_prototype,
     )
-    got = await a.getitem(selection=(slice(0, 10), slice(0, 10)), prototype=gpu_buffer_prototype)
+    got = await a.getitem(selection=(slice(0, 10), slice(0, 10)), prototype=gpu.buffer_prototype)
     assert isinstance(got, cp.ndarray)
     assert cp.array_equal(expect, got)

From 4e1809885f448e951fbf2773d00537320e7ee6a7 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Mon, 8 Jul 2024 16:21:52 -0700
Subject: [PATCH 13/46] Templating store tests on Buffer type

---
 pyproject.toml                     |  2 +-
 src/zarr/buffer/gpu.py             | 36 +++++++++++++++++-------------
 src/zarr/store/memory.py           |  9 ++++----
 src/zarr/testing/store.py          | 24 +++++++++++---------
 tests/v3/test_store/test_local.py  |  7 +++---
 tests/v3/test_store/test_memory.py | 10 +++++----
 tests/v3/test_store/test_remote.py |  9 ++++----
 7 files changed, 53 insertions(+), 44 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 40f25a9c96..6fb1729314 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -223,5 +223,5 @@ filterwarnings = [
     "error:::zarr.*",
     "ignore:PY_SSIZE_T_CLEAN will be required.*:DeprecationWarning",
     "ignore:The loop argument is deprecated since Python 3.8.*:DeprecationWarning",
-    "ignore:Creating a Gpu*:UserWarning",
+    "ignore:Creating a zarr.buffer.gpu.*:UserWarning",
 ]
diff --git a/src/zarr/buffer/gpu.py b/src/zarr/buffer/gpu.py
index d4f626ed69..97461e36ab 100644
--- a/src/zarr/buffer/gpu.py
+++ b/src/zarr/buffer/gpu.py
@@ -12,7 +12,7 @@
 import numpy as np
 import numpy.typing as npt
 
-from zarr.buffer import ArrayLike, Buffer, BufferPrototype, NDArrayLike, NDBuffer
+from zarr.buffer import ArrayLike, BufferPrototype, NDArrayLike, core
 
 if TYPE_CHECKING:
     from typing_extensions import Self
@@ -25,7 +25,7 @@
     cp = None
 
 
-class GpuBuffer(Buffer):
+class Buffer(core.Buffer):
     """A flat contiguous memory block on the GPU
 
     We use Buffer throughout Zarr to represent a contiguous block of memory.
@@ -48,7 +48,9 @@ class GpuBuffer(Buffer):
 
     def __init__(self, array_like: ArrayLike):
         if cp is None:
-            raise ImportError("Cannot use GpuBuffer without cupy. Please install cupy.")
+            raise ImportError(
+                "Cannot use zarr.buffer.gpu.Buffer without cupy. Please install cupy."
+            )
 
         if array_like.ndim != 1:
             raise ValueError("array_like: only 1-dim allowed")
@@ -59,7 +61,7 @@ def __init__(self, array_like: ArrayLike):
             # Slow copy based path for arrays that don't support the __cuda_array_interface__
             # TODO: Add a fast zero-copy path for arrays that support the dlpack protocol
             msg = (
-                "Creating a GpuBuffer with an array that does not support the "
+                "Creating a zarr.buffer.gpu.Buffer with an array that does not support the "
                 "__cuda_array_interface__ for zero-copy transfers, "
                 "falling back to slow copy based path"
             )
@@ -80,14 +82,14 @@ def create_zero_length(cls) -> Self:
         return cls(cp.array([], dtype="b"))
 
     @classmethod
-    def from_buffer(cls, buffer: Buffer) -> Self:
-        """Create an GpuBuffer given an arbitrary Buffer
+    def from_buffer(cls, buffer: core.Buffer) -> Self:
+        """Create an GPU Buffer given an arbitrary Buffer
         This will try to be zero-copy if `buffer` is already on the
         GPU and will trigger a copy if not.
 
         Returns
         -------
-            New GpuBuffer constructed from `buffer`
+            New GPU Buffer constructed from `buffer`
         """
         return cls(buffer.as_array_like())
 
@@ -98,17 +100,17 @@ def from_bytes(cls, bytes_like: BytesLike) -> Self:
     def as_numpy_array(self) -> npt.NDArray[Any]:
         return cast(npt.NDArray[Any], cp.asnumpy(self._data))
 
-    def __add__(self, other: Buffer) -> Self:
+    def __add__(self, other: core.Buffer) -> Self:
         other_array = other.as_array_like()
         assert other_array.dtype == np.dtype("b")
-        gpu_other = GpuBuffer(other_array)
+        gpu_other = Buffer(other_array)
         gpu_other_array = gpu_other.as_array_like()
         return self.__class__(
             cp.concatenate((cp.asanyarray(self._data), cp.asanyarray(gpu_other_array)))
         )
 
 
-class GpuNDBuffer(NDBuffer):
+class NDBuffer(core.NDBuffer):
     """A n-dimensional memory block on the GPU
 
     We use NDBuffer throughout Zarr to represent a n-dimensional memory block.
@@ -135,7 +137,9 @@ class GpuNDBuffer(NDBuffer):
 
     def __init__(self, array: NDArrayLike):
         if cp is None:
-            raise ImportError("Cannot use GpuNDBuffer without cupy. Please install cupy.")
+            raise ImportError(
+                "Cannot use zarr.buffer.gpu.NDBuffer without cupy. Please install cupy."
+            )
 
         # assert array.ndim > 0
         assert array.dtype != object
@@ -145,7 +149,7 @@ def __init__(self, array: NDArrayLike):
             # Slow copy based path for arrays that don't support the __cuda_array_interface__
             # TODO: Add a fast zero-copy path for arrays that support the dlpack protocol
             msg = (
-                "Creating a GpuNDBuffer with an array that does not support the "
+                "Creating a zarr.buffer.gpu.NDBuffer with an array that does not support the "
                 "__cuda_array_interface__ for zero-copy transfers, "
                 "falling back to slow copy based path"
             )
@@ -201,12 +205,12 @@ def __getitem__(self, key: Any) -> Self:
         return self.__class__(self._data.__getitem__(key))
 
     def __setitem__(self, key: Any, value: Any) -> None:
-        if isinstance(value, GpuNDBuffer):
+        if isinstance(value, NDBuffer):
             value = value._data
-        elif isinstance(value, NDBuffer):
-            gpu_value = GpuNDBuffer(value.as_ndarray_like())
+        elif isinstance(value, core.NDBuffer):
+            gpu_value = NDBuffer(value.as_ndarray_like())
             value = gpu_value._data
         self._data.__setitem__(key, value)
 
 
-buffer_prototype = BufferPrototype(buffer=GpuBuffer, nd_buffer=GpuNDBuffer)
+buffer_prototype = BufferPrototype(buffer=Buffer, nd_buffer=NDBuffer)
diff --git a/src/zarr/store/memory.py b/src/zarr/store/memory.py
index 1cd006098d..682c8cebf3 100644
--- a/src/zarr/store/memory.py
+++ b/src/zarr/store/memory.py
@@ -3,8 +3,7 @@
 from collections.abc import AsyncGenerator, MutableMapping
 
 from zarr.abc.store import Store
-from zarr.buffer import Buffer, BufferPrototype
-from zarr.buffer.gpu import GpuBuffer
+from zarr.buffer import Buffer, BufferPrototype, gpu
 from zarr.common import OpenMode, concurrent_map
 from zarr.store.utils import _normalize_interval_index
 
@@ -118,7 +117,7 @@ def __init__(
     ):
         super().__init__(mode=mode)
         if store_dict:
-            self._store_dict = {k: GpuBuffer.from_buffer(store_dict[k]) for k in iter(store_dict)}
+            self._store_dict = {k: gpu.Buffer.from_buffer(store_dict[k]) for k in iter(store_dict)}
 
     def __str__(self) -> str:
         return f"gpumemory://{id(self._store_dict)}"
@@ -132,6 +131,6 @@ async def set(self, key: str, value: Buffer, byte_range: tuple[int, int] | None
         if not isinstance(value, Buffer):
             raise TypeError(f"Expected Buffer. Got {type(value)}.")
 
-        # Convert to GpuBuffer
-        gpu_value = value if isinstance(value, GpuBuffer) else GpuBuffer.from_buffer(value)
+        # Convert to gpu.Buffer
+        gpu_value = value if isinstance(value, gpu.Buffer) else gpu.Buffer.from_buffer(value)
         await super().set(key, gpu_value, byte_range=byte_range)
diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py
index 9c37ce0434..ab289efd32 100644
--- a/src/zarr/testing/store.py
+++ b/src/zarr/testing/store.py
@@ -8,10 +8,12 @@
 from zarr.testing.utils import assert_bytes_equal
 
 S = TypeVar("S", bound=Store)
+B = TypeVar("B", bound=Buffer)
 
 
-class StoreTests(Generic[S]):
+class StoreTests(Generic[S, B]):
     store_cls: type[S]
+    buffer_cls: type[B]
 
     def set(self, store: S, key: str, value: Buffer) -> None:
         """
@@ -62,7 +64,7 @@ async def test_not_writable_store_raises(self, store_kwargs: dict[str, Any]) ->
 
         # set
         with pytest.raises(ValueError):
-            await store.set("foo", Buffer.from_bytes(b"bar"))
+            await store.set("foo", self.buffer_cls.from_bytes(b"bar"))
 
         # delete
         with pytest.raises(ValueError):
@@ -89,7 +91,7 @@ async def test_get(
         """
         Ensure that data can be read from the store using the store.get method.
         """
-        data_buf = Buffer.from_bytes(data)
+        data_buf = self.buffer_cls.from_bytes(data)
         self.set(store, key, data_buf)
         observed = await store.get(key, prototype=default_buffer_prototype, byte_range=byte_range)
         start, length = _normalize_interval_index(data_buf, interval=byte_range)
@@ -103,7 +105,7 @@ async def test_set(self, store: S, key: str, data: bytes) -> None:
         Ensure that data can be written to the store using the store.set method.
         """
         assert store.writeable
-        data_buf = Buffer.from_bytes(data)
+        data_buf = self.buffer_cls.from_bytes(data)
         await store.set(key, data_buf)
         observed = self.get(store, key)
         assert_bytes_equal(observed, data_buf)
@@ -122,7 +124,7 @@ async def test_get_partial_values(
     ) -> None:
         # put all of the data
         for key, _ in key_ranges:
-            self.set(store, key, Buffer.from_bytes(bytes(key, encoding="utf-8")))
+            self.set(store, key, self.buffer_cls.from_bytes(bytes(key, encoding="utf-8")))
 
         # read back just part of it
         observed_maybe = await store.get_partial_values(
@@ -148,18 +150,18 @@ async def test_get_partial_values(
 
     async def test_exists(self, store: S) -> None:
         assert not await store.exists("foo")
-        await store.set("foo/zarr.json", Buffer.from_bytes(b"bar"))
+        await store.set("foo/zarr.json", self.buffer_cls.from_bytes(b"bar"))
         assert await store.exists("foo/zarr.json")
 
     async def test_delete(self, store: S) -> None:
-        await store.set("foo/zarr.json", Buffer.from_bytes(b"bar"))
+        await store.set("foo/zarr.json", self.buffer_cls.from_bytes(b"bar"))
         assert await store.exists("foo/zarr.json")
         await store.delete("foo/zarr.json")
         assert not await store.exists("foo/zarr.json")
 
     async def test_list(self, store: S) -> None:
         assert [k async for k in store.list()] == []
-        await store.set("foo/zarr.json", Buffer.from_bytes(b"bar"))
+        await store.set("foo/zarr.json", self.buffer_cls.from_bytes(b"bar"))
         keys = [k async for k in store.list()]
         assert keys == ["foo/zarr.json"], keys
 
@@ -168,7 +170,7 @@ async def test_list(self, store: S) -> None:
             key = f"foo/c/{i}"
             expected.append(key)
             await store.set(
-                f"foo/c/{i}", Buffer.from_bytes(i.to_bytes(length=3, byteorder="little"))
+                f"foo/c/{i}", self.buffer_cls.from_bytes(i.to_bytes(length=3, byteorder="little"))
             )
 
     @pytest.mark.xfail
@@ -180,8 +182,8 @@ async def test_list_dir(self, store: S) -> None:
         out = [k async for k in store.list_dir("")]
         assert out == []
         assert [k async for k in store.list_dir("foo")] == []
-        await store.set("foo/zarr.json", Buffer.from_bytes(b"bar"))
-        await store.set("foo/c/1", Buffer.from_bytes(b"\x01"))
+        await store.set("foo/zarr.json", self.buffer_cls.from_bytes(b"bar"))
+        await store.set("foo/c/1", self.buffer_cls.from_bytes(b"\x01"))
 
         keys = [k async for k in store.list_dir("foo")]
         assert set(keys) == set(["zarr.json", "c"]), keys
diff --git a/tests/v3/test_store/test_local.py b/tests/v3/test_store/test_local.py
index 191a137d46..aa078fd02e 100644
--- a/tests/v3/test_store/test_local.py
+++ b/tests/v3/test_store/test_local.py
@@ -2,16 +2,17 @@
 
 import pytest
 
-from zarr.buffer import Buffer
+from zarr.buffer import Buffer, cpu
 from zarr.store.local import LocalStore
 from zarr.testing.store import StoreTests
 
 
-class TestLocalStore(StoreTests[LocalStore]):
+class TestLocalStore(StoreTests[LocalStore, cpu.Buffer]):
     store_cls = LocalStore
+    buffer_cls = cpu.Buffer
 
     def get(self, store: LocalStore, key: str) -> Buffer:
-        return Buffer.from_bytes((store.root / key).read_bytes())
+        return self.buffer_cls.from_bytes((store.root / key).read_bytes())
 
     def set(self, store: LocalStore, key: str, value: Buffer) -> None:
         parent = (store.root / key).parent
diff --git a/tests/v3/test_store/test_memory.py b/tests/v3/test_store/test_memory.py
index 8d7c9f5499..fc8e813579 100644
--- a/tests/v3/test_store/test_memory.py
+++ b/tests/v3/test_store/test_memory.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from zarr.buffer import Buffer, GpuBuffer
+from zarr.buffer import Buffer, cpu, gpu
 from zarr.store.memory import GpuMemoryStore, MemoryStore
 from zarr.testing.store import StoreTests
 
@@ -12,8 +12,9 @@
     cp = None
 
 
-class TestMemoryStore(StoreTests[MemoryStore]):
+class TestMemoryStore(StoreTests[MemoryStore, cpu.Buffer]):
     store_cls = MemoryStore
+    buffer_cls = cpu.Buffer
 
     def set(self, store: MemoryStore, key: str, value: Buffer) -> None:
         store._store_dict[key] = value
@@ -46,8 +47,9 @@ def test_list_prefix(self, store: MemoryStore) -> None:
 
 
 @pytest.mark.skipif(cp is None, reason="requires cupy")
-class TestGpuMemoryStore(StoreTests[GpuMemoryStore]):
+class TestGpuMemoryStore(StoreTests[GpuMemoryStore, gpu.Buffer]):
     store_cls = GpuMemoryStore
+    buffer_cls = gpu.Buffer
 
     def set(self, store: GpuMemoryStore, key: str, value: Buffer) -> None:
         store._store_dict[key] = value
@@ -60,7 +62,7 @@ def store_kwargs(self, request) -> dict[str, str | None | dict[str, Buffer]]:
         return {"store_dict": request.param, "mode": "w"}
 
     @pytest.fixture(scope="function")
-    def store(self, store_kwargs: str | None | dict[str, GpuBuffer]) -> GpuMemoryStore:
+    def store(self, store_kwargs: str | None | dict[str, gpu.Buffer]) -> GpuMemoryStore:
         return self.store_cls(**store_kwargs)
 
     def test_store_repr(self, store: GpuMemoryStore) -> None:
diff --git a/tests/v3/test_store/test_remote.py b/tests/v3/test_store/test_remote.py
index 0dc399be42..e8b920e29c 100644
--- a/tests/v3/test_store/test_remote.py
+++ b/tests/v3/test_store/test_remote.py
@@ -4,7 +4,7 @@
 import pytest
 from upath import UPath
 
-from zarr.buffer import Buffer, default_buffer_prototype
+from zarr.buffer import Buffer, cpu, default_buffer_prototype
 from zarr.store import RemoteStore
 from zarr.sync import sync
 from zarr.testing.store import StoreTests
@@ -86,7 +86,7 @@ async def test_basic():
     assert not await alist(store.list())
     assert not await store.exists("foo")
     data = b"hello"
-    await store.set("foo", Buffer.from_bytes(data))
+    await store.set("foo", cpu.Buffer.from_bytes(data))
     assert await store.exists("foo")
     assert (await store.get("foo", prototype=default_buffer_prototype)).to_bytes() == data
     out = await store.get_partial_values(
@@ -95,8 +95,9 @@ async def test_basic():
     assert out[0].to_bytes() == data[1:]
 
 
-class TestRemoteStoreS3(StoreTests[RemoteStore]):
+class TestRemoteStoreS3(StoreTests[RemoteStore, cpu.Buffer]):
     store_cls = RemoteStore
+    buffer_cls = cpu.Buffer
 
     @pytest.fixture(scope="function", params=("use_upath", "use_str"))
     def store_kwargs(self, request) -> dict[str, str | bool]:
@@ -129,7 +130,7 @@ def get(self, store: RemoteStore, key: str) -> Buffer:
             anon=store._fs.anon,
             endpoint_url=store._fs.endpoint_url,
         )
-        return Buffer.from_bytes(fs.cat(f"{store.path}/{key}"))
+        return self.buffer_cls.from_bytes(fs.cat(f"{store.path}/{key}"))
 
     def set(self, store: RemoteStore, key: str, value: Buffer) -> None:
         #  make a new, synchronous instance of the filesystem because this test is run in sync code

From 35948d48365b45de3bc3c9f6ee7f0b703bc522ef Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Mon, 8 Jul 2024 16:49:32 -0700
Subject: [PATCH 14/46] Changing imports to prevent circular dependencies

---
 src/zarr/buffer/cpu.py | 3 ++-
 src/zarr/buffer/gpu.py | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/zarr/buffer/cpu.py b/src/zarr/buffer/cpu.py
index d791d8e696..6a44f30b70 100644
--- a/src/zarr/buffer/cpu.py
+++ b/src/zarr/buffer/cpu.py
@@ -10,7 +10,8 @@
 import numpy as np
 import numpy.typing as npt
 
-from zarr.buffer import ArrayLike, NDArrayLike, core
+from zarr.buffer import core
+from zarr.buffer.core import ArrayLike, NDArrayLike
 
 if TYPE_CHECKING:
     from typing_extensions import Self
diff --git a/src/zarr/buffer/gpu.py b/src/zarr/buffer/gpu.py
index 97461e36ab..25b9a80297 100644
--- a/src/zarr/buffer/gpu.py
+++ b/src/zarr/buffer/gpu.py
@@ -12,7 +12,8 @@
 import numpy as np
 import numpy.typing as npt
 
-from zarr.buffer import ArrayLike, BufferPrototype, NDArrayLike, core
+from zarr.buffer import core
+from zarr.buffer.core import ArrayLike, BufferPrototype, NDArrayLike
 
 if TYPE_CHECKING:
     from typing_extensions import Self

From bd2a20bcd70f9717672ea19c06684887182b232c Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Mon, 15 Jul 2024 10:29:37 -0700
Subject: [PATCH 15/46] Fixing unsafe calls to Buffer abstract methods in
 metadata.py and group.py

---
 src/zarr/group.py    | 8 ++++----
 src/zarr/metadata.py | 8 +++++---
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/zarr/group.py b/src/zarr/group.py
index e6e2ac183f..de73c6061a 100644
--- a/src/zarr/group.py
+++ b/src/zarr/group.py
@@ -14,7 +14,7 @@
 from zarr.abc.store import set_or_delete
 from zarr.array import Array, AsyncArray
 from zarr.attributes import Attributes
-from zarr.buffer import Buffer
+from zarr.buffer import Buffer, default_buffer_prototype
 from zarr.chunk_key_encodings import ChunkKeyEncoding
 from zarr.common import (
     JSON,
@@ -83,16 +83,16 @@ def to_buffer_dict(self) -> dict[str, Buffer]:
         json_indent = config.get("json_indent")
         if self.zarr_format == 3:
             return {
-                ZARR_JSON: Buffer.from_bytes(
+                ZARR_JSON: default_buffer_prototype.buffer.from_bytes(
                     json.dumps(self.to_dict(), indent=json_indent).encode()
                 )
             }
         else:
             return {
-                ZGROUP_JSON: Buffer.from_bytes(
+                ZGROUP_JSON: default_buffer_prototype.buffer.from_bytes(
                     json.dumps({"zarr_format": self.zarr_format}, indent=json_indent).encode()
                 ),
-                ZATTRS_JSON: Buffer.from_bytes(
+                ZATTRS_JSON: default_buffer_prototype.buffer.from_bytes(
                     json.dumps(self.attributes, indent=json_indent).encode()
                 ),
             }
diff --git a/src/zarr/metadata.py b/src/zarr/metadata.py
index ef7edbd560..c69fc708c6 100644
--- a/src/zarr/metadata.py
+++ b/src/zarr/metadata.py
@@ -269,7 +269,7 @@ def _json_convert(o: np.dtype[Any] | Enum | Codec) -> str | dict[str, Any]:
 
         json_indent = config.get("json_indent")
         return {
-            ZARR_JSON: Buffer.from_bytes(
+            ZARR_JSON: default_buffer_prototype.buffer.from_bytes(
                 json.dumps(self.to_dict(), default=_json_convert, indent=json_indent).encode()
             )
         }
@@ -386,10 +386,12 @@ def _json_convert(
         assert isinstance(zattrs_dict, dict)
         json_indent = config.get("json_indent")
         return {
-            ZARRAY_JSON: Buffer.from_bytes(
+            ZARRAY_JSON: default_buffer_prototype.buffer.from_bytes(
                 json.dumps(zarray_dict, default=_json_convert, indent=json_indent).encode()
             ),
-            ZATTRS_JSON: Buffer.from_bytes(json.dumps(zattrs_dict, indent=json_indent).encode()),
+            ZATTRS_JSON: default_buffer_prototype.buffer.from_bytes(
+                json.dumps(zattrs_dict, indent=json_indent).encode()
+            ),
         }
 
     @classmethod

From 828401f5b863277e876dc387e25e6723b0e82d77 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Mon, 15 Jul 2024 14:50:07 -0700
Subject: [PATCH 16/46] Preventing calls to abstract classmethods of Buffer and
 NDBuffer

---
 src/zarr/buffer/core.py | 35 ++++++++++++++++++++++++++++++-----
 tests/v3/test_group.py  | 11 ++++++++---
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/src/zarr/buffer/core.py b/src/zarr/buffer/core.py
index 755a3710d0..3f23b6d3da 100644
--- a/src/zarr/buffer/core.py
+++ b/src/zarr/buffer/core.py
@@ -10,6 +10,7 @@
     NamedTuple,
     Protocol,
     SupportsIndex,
+    cast,
     runtime_checkable,
 )
 
@@ -144,7 +145,11 @@ def create_zero_length(cls) -> Self:
         -------
             New empty 0-length buffer
         """
-        ...
+        if cls is Buffer:
+            raise NotImplementedError("Cannot call abstract method on the abstract class 'Buffer'")
+        return cls(
+            cast(ArrayLike, None)
+        )  # This line will never be reached, but it satisfies the type checker
 
     @classmethod
     def from_array_like(cls, array_like: ArrayLike) -> Self:
@@ -188,7 +193,11 @@ def from_buffer(cls, buffer: Buffer) -> Self:
         Subclasses of `Buffer` must override this method to implement
         more optimal conversions that avoid copies where possible
         """
-        ...
+        if cls is Buffer:
+            raise NotImplementedError("Cannot call abstract method on the abstract class 'Buffer'")
+        return cls(
+            cast(ArrayLike, None)
+        )  # This line will never be reached, but it satisfies the type checker
 
     @classmethod
     @abstractmethod
@@ -204,7 +213,11 @@ def from_bytes(cls, bytes_like: BytesLike) -> Self:
         -------
             New buffer representing `bytes_like`
         """
-        ...
+        if cls is Buffer:
+            raise NotImplementedError("Cannot call abstract method on the abstract class 'Buffer'")
+        return cls(
+            cast(ArrayLike, None)
+        )  # This line will never be reached, but it satisfies the type checker
 
     def as_array_like(self) -> ArrayLike:
         """Returns the underlying array (host or device memory) of this buffer
@@ -325,7 +338,13 @@ def create(
         A subclass can overwrite this method to create a ndarray-like object
         other then the default Numpy array.
         """
-        ...
+        if cls is NDBuffer:
+            raise NotImplementedError(
+                "Cannot call abstract method on the abstract class 'NDBuffer'"
+            )
+        return cls(
+            cast(NDArrayLike, None)
+        )  # This line will never be reached, but it satisfies the type checker
 
     @classmethod
     def from_ndarray_like(cls, ndarray_like: NDArrayLike) -> Self:
@@ -356,7 +375,13 @@ def from_numpy_array(cls, array_like: npt.ArrayLike) -> Self:
         -------
             New buffer representing `array_like`
         """
-        ...
+        if cls is NDBuffer:
+            raise NotImplementedError(
+                "Cannot call abstract method on the abstract class 'NDBuffer'"
+            )
+        return cls(
+            cast(NDArrayLike, None)
+        )  # This line will never be reached, but it satisfies the type checker
 
     def as_ndarray_like(self) -> NDArrayLike:
         """Returns the underlying array (host or device memory) of this buffer
diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py
index e11af748b3..3622c914c1 100644
--- a/tests/v3/test_group.py
+++ b/tests/v3/test_group.py
@@ -3,7 +3,7 @@
 from typing import TYPE_CHECKING, Any
 
 from zarr.array import AsyncArray
-from zarr.buffer import Buffer
+from zarr.buffer import default_buffer_prototype
 from zarr.store.core import make_store_path
 from zarr.sync import sync
 
@@ -45,11 +45,16 @@ def test_group_children(store: MemoryStore | LocalStore) -> None:
 
     # add an extra object to the domain of the group.
     # the list of children should ignore this object.
-    sync(store.set(f"{path}/extra_object-1", Buffer.from_bytes(b"000000")))
+    sync(store.set(f"{path}/extra_object-1", default_buffer_prototype.buffer.from_bytes(b"000000")))
     # add an extra object under a directory-like prefix in the domain of the group.
     # this creates a directory with a random key in it
     # this should not show up as a member
-    sync(store.set(f"{path}/extra_directory/extra_object-2", Buffer.from_bytes(b"000000")))
+    sync(
+        store.set(
+            f"{path}/extra_directory/extra_object-2",
+            default_buffer_prototype.buffer.from_bytes(b"000000"),
+        )
+    )
     members_observed = group.members
     # members are not guaranteed to be ordered, so sort before comparing
     assert sorted(dict(members_observed)) == sorted(members_expected)

From 02a6e9d68306e7dd0ef674812b74774c1d2c362b Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 9 Aug 2024 00:06:57 -0700
Subject: [PATCH 17/46] Fixing some more unsafe usage of Buffer abstract class

---
 src/zarr/codecs/_v2.py      | 10 +++++-----
 src/zarr/codecs/sharding.py |  8 +++++---
 tests/v3/test_indexing.py   | 18 ++++++++++++------
 3 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py
index c43a087a94..cf42369b4f 100644
--- a/src/zarr/codecs/_v2.py
+++ b/src/zarr/codecs/_v2.py
@@ -7,7 +7,7 @@
 
 from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec
 from zarr.array_spec import ArraySpec
-from zarr.buffer import Buffer, NDBuffer
+from zarr.buffer import Buffer, NDBuffer, cpu
 from zarr.common import JSON, to_thread
 
 
@@ -34,7 +34,7 @@ async def _decode_single(
         if str(chunk_numpy_array.dtype) != chunk_spec.dtype:
             chunk_numpy_array = chunk_numpy_array.view(chunk_spec.dtype)
 
-        return NDBuffer.from_numpy_array(chunk_numpy_array)
+        return cpu.NDBuffer.from_numpy_array(chunk_numpy_array)
 
     async def _encode_single(
         self,
@@ -55,7 +55,7 @@ async def _encode_single(
         else:
             encoded_chunk_bytes = ensure_bytes(chunk_numpy_array)
 
-        return Buffer.from_bytes(encoded_chunk_bytes)
+        return cpu.Buffer.from_bytes(encoded_chunk_bytes)
 
     def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int:
         raise NotImplementedError
@@ -86,7 +86,7 @@ async def _decode_single(
                 order=chunk_spec.order,
             )
 
-        return NDBuffer.from_ndarray_like(chunk_ndarray)
+        return cpu.NDBuffer.from_ndarray_like(chunk_ndarray)
 
     async def _encode_single(
         self,
@@ -99,7 +99,7 @@ async def _encode_single(
             filter = numcodecs.get_codec(filter_metadata)
             chunk_ndarray = await to_thread(filter.encode, chunk_ndarray)
 
-        return NDBuffer.from_ndarray_like(chunk_ndarray)
+        return cpu.NDBuffer.from_ndarray_like(chunk_ndarray)
 
     def compute_encoded_size(self, _input_byte_length: int, _chunk_spec: ArraySpec) -> int:
         raise NotImplementedError
diff --git a/src/zarr/codecs/sharding.py b/src/zarr/codecs/sharding.py
index e3ef664b94..bed085692a 100644
--- a/src/zarr/codecs/sharding.py
+++ b/src/zarr/codecs/sharding.py
@@ -172,7 +172,7 @@ async def from_bytes(
     def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardReader:
         index = _ShardIndex.create_empty(chunks_per_shard)
         obj = cls()
-        obj.buf = Buffer.create_zero_length()
+        obj.buf = default_buffer_prototype.buffer.create_zero_length()
         obj.index = index
         return obj
 
@@ -217,7 +217,7 @@ def merge_with_morton_order(
     @classmethod
     def create_empty(cls, chunks_per_shard: ChunkCoords) -> _ShardBuilder:
         obj = cls()
-        obj.buf = Buffer.create_zero_length()
+        obj.buf = default_buffer_prototype.buffer.create_zero_length()
         obj.index = _ShardIndex.create_empty(chunks_per_shard)
         return obj
 
@@ -607,7 +607,9 @@ async def _encode_shard_index(self, index: _ShardIndex) -> Buffer:
                 await BatchedCodecPipeline.from_list(self.index_codecs).encode(
                     [
                         (
-                            NDBuffer.from_numpy_array(index.offsets_and_lengths),
+                            default_buffer_prototype.nd_buffer.from_numpy_array(
+                                index.offsets_and_lengths
+                            ),
                             self._get_index_chunk_spec(index.chunks_per_shard),
                         )
                     ],
diff --git a/tests/v3/test_indexing.py b/tests/v3/test_indexing.py
index c84c091089..46ce5981d4 100644
--- a/tests/v3/test_indexing.py
+++ b/tests/v3/test_indexing.py
@@ -12,7 +12,7 @@
 
 import zarr
 from zarr.abc.store import Store
-from zarr.buffer import BufferPrototype, NDBuffer
+from zarr.buffer import BufferPrototype, default_buffer_prototype
 from zarr.common import ChunkCoords
 from zarr.indexing import (
     make_slice_selection,
@@ -133,7 +133,7 @@ def test_get_basic_selection_0d(store: StorePath, use_out: bool, value: Any, dty
 
     if use_out:
         # test out param
-        b = NDBuffer.from_numpy_array(np.zeros_like(arr_np))
+        b = default_buffer_prototype.nd_buffer.from_numpy_array(np.zeros_like(arr_np))
         arr_z.get_basic_selection(Ellipsis, out=b)
         assert_array_equal(arr_np, b.as_ndarray_like())
 
@@ -244,7 +244,9 @@ def _test_get_basic_selection(a, z, selection):
     assert_array_equal(expect, actual)
 
     # test out param
-    b = NDBuffer.from_numpy_array(np.empty(shape=expect.shape, dtype=expect.dtype))
+    b = default_buffer_prototype.nd_buffer.from_numpy_array(
+        np.empty(shape=expect.shape, dtype=expect.dtype)
+    )
     z.get_basic_selection(selection, out=b)
     assert_array_equal(expect, b.as_numpy_array())
 
@@ -1393,7 +1395,7 @@ def test_get_selection_out(store: StorePath):
     ]
     for selection in selections:
         expect = a[selection]
-        out = NDBuffer.from_numpy_array(np.empty(expect.shape))
+        out = default_buffer_prototype.nd_buffer.from_numpy_array(np.empty(expect.shape))
         z.get_basic_selection(selection, out=out)
         assert_array_equal(expect, out.as_numpy_array()[:])
 
@@ -1423,7 +1425,9 @@ def test_get_selection_out(store: StorePath):
         ]
         for selection in selections:
             expect = oindex(a, selection)
-            out = NDBuffer.from_numpy_array(np.zeros(expect.shape, dtype=expect.dtype))
+            out = default_buffer_prototype.nd_buffer.from_numpy_array(
+                np.zeros(expect.shape, dtype=expect.dtype)
+            )
             z.get_orthogonal_selection(selection, out=out)
             assert_array_equal(expect, out.as_numpy_array()[:])
 
@@ -1445,7 +1449,9 @@ def test_get_selection_out(store: StorePath):
         ]
         for selection in selections:
             expect = a[selection]
-            out = NDBuffer.from_numpy_array(np.zeros(expect.shape, dtype=expect.dtype))
+            out = default_buffer_prototype.nd_buffer.from_numpy_array(
+                np.zeros(expect.shape, dtype=expect.dtype)
+            )
             z.get_coordinate_selection(selection, out=out)
             assert_array_equal(expect, out.as_numpy_array()[:])
 

From ff40d3cca51e458752409f76b6f6ad71923200bf Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 9 Aug 2024 00:07:23 -0700
Subject: [PATCH 18/46] Initial testing with cirun based GPU CI

---
 .cirun.yml                     |  9 +++++++
 .github/workflows/gpu_test.yml | 48 ++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)
 create mode 100644 .cirun.yml
 create mode 100644 .github/workflows/gpu_test.yml

diff --git a/.cirun.yml b/.cirun.yml
new file mode 100644
index 0000000000..f58c2e866c
--- /dev/null
+++ b/.cirun.yml
@@ -0,0 +1,9 @@
+runners:
+  - name: "zarr-gpu-runner"
+    cloud: "gcp"
+    gpu: "nvidia-tesla-t4"
+    instance_type: "n1-standard-1"
+    machine_image: "common-cu123-v20240730-ubuntu-2204-py310"
+    preemptible: true
+    labels:
+      - "cirun-gpu-runner"
diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
new file mode 100644
index 0000000000..f552dca8c0
--- /dev/null
+++ b/.github/workflows/gpu_test.yml
@@ -0,0 +1,48 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Test V3
+
+on:
+  push:
+    branches: [ v3 ]
+  pull_request:
+    branches: [ v3 ]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test:
+    name: py=${{ matrix.python-version }}, np=${{ matrix.numpy-version }}, deps=${{ matrix.dependency-set }}
+
+    runs-on: "cirun-gpu-runner--${{ github.run_id }}"
+    strategy:
+      matrix:
+        python-version: ['3.10', '3.11', '3.12']
+        numpy-version: ['1.24', '1.26', '2.0']
+        dependency-set: ["minimal", "optional"]
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: GPU check
+      run: |
+        nvidia-smi
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+        cache: 'pip'
+    - name: Install Hatch
+      run: |
+        python -m pip install --upgrade pip 
+        pip install hatch
+    - name: Set Up Hatch Env
+      run: |
+        hatch env create test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }}
+        hatch env run -e test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }} list-env
+    - name: Run Tests
+      run: |
+        hatch env run --env test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }} run

From e5cfd2fa7c5b2f1a12b2d5952ac3d4ebc22a28f0 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 9 Aug 2024 00:35:16 -0700
Subject: [PATCH 19/46] Reverting to basic ubuntu machine image on GCP

---
 .cirun.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.cirun.yml b/.cirun.yml
index f58c2e866c..f38408e58a 100644
--- a/.cirun.yml
+++ b/.cirun.yml
@@ -3,7 +3,8 @@ runners:
     cloud: "gcp"
     gpu: "nvidia-tesla-t4"
     instance_type: "n1-standard-1"
-    machine_image: "common-cu123-v20240730-ubuntu-2204-py310"
+    # machine_image: "common-cu123-v20240730-ubuntu-2204-py310"
+    machine_image: "ubuntu-2204-jammy-v20240801"
     preemptible: true
     labels:
       - "cirun-gpu-runner"

From d473a3dd58405596b89e1a2276566596dd38de8b Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 9 Aug 2024 00:49:24 -0700
Subject: [PATCH 20/46] Switching to cuda image from the docker registry

---
 .cirun.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.cirun.yml b/.cirun.yml
index f38408e58a..ff73cdaa1d 100644
--- a/.cirun.yml
+++ b/.cirun.yml
@@ -4,7 +4,7 @@ runners:
     gpu: "nvidia-tesla-t4"
     instance_type: "n1-standard-1"
     # machine_image: "common-cu123-v20240730-ubuntu-2204-py310"
-    machine_image: "ubuntu-2204-jammy-v20240801"
+    machine_image: "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cu121.py310"
     preemptible: true
     labels:
       - "cirun-gpu-runner"

From 2a2e399a5d55f9b059cbdf1c21727e8e29f8be7f Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 9 Aug 2024 07:52:57 -0700
Subject: [PATCH 21/46] Revert "Switching to cuda image from the docker
 registry"

This reverts commit d473a3dd58405596b89e1a2276566596dd38de8b.
---
 .cirun.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.cirun.yml b/.cirun.yml
index ff73cdaa1d..f38408e58a 100644
--- a/.cirun.yml
+++ b/.cirun.yml
@@ -4,7 +4,7 @@ runners:
     gpu: "nvidia-tesla-t4"
     instance_type: "n1-standard-1"
     # machine_image: "common-cu123-v20240730-ubuntu-2204-py310"
-    machine_image: "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/base-cu121.py310"
+    machine_image: "ubuntu-2204-jammy-v20240801"
     preemptible: true
     labels:
       - "cirun-gpu-runner"

From b89ab9a22f13020b2a86c8cee54f2941492d4f8b Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 9 Aug 2024 07:53:20 -0700
Subject: [PATCH 22/46] Revert "Reverting to basic ubuntu machine image on GCP"

This reverts commit e5cfd2fa7c5b2f1a12b2d5952ac3d4ebc22a28f0.
---
 .cirun.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.cirun.yml b/.cirun.yml
index f38408e58a..f58c2e866c 100644
--- a/.cirun.yml
+++ b/.cirun.yml
@@ -3,8 +3,7 @@ runners:
     cloud: "gcp"
     gpu: "nvidia-tesla-t4"
     instance_type: "n1-standard-1"
-    # machine_image: "common-cu123-v20240730-ubuntu-2204-py310"
-    machine_image: "ubuntu-2204-jammy-v20240801"
+    machine_image: "common-cu123-v20240730-ubuntu-2204-py310"
     preemptible: true
     labels:
       - "cirun-gpu-runner"

From c5a387dc947c60c116cbe66b798edcf1887947a0 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 9 Aug 2024 07:53:24 -0700
Subject: [PATCH 23/46] Revert "Initial testing with cirun based GPU CI"

This reverts commit ff40d3cca51e458752409f76b6f6ad71923200bf.
---
 .cirun.yml                     |  9 -------
 .github/workflows/gpu_test.yml | 48 ----------------------------------
 2 files changed, 57 deletions(-)
 delete mode 100644 .cirun.yml
 delete mode 100644 .github/workflows/gpu_test.yml

diff --git a/.cirun.yml b/.cirun.yml
deleted file mode 100644
index f58c2e866c..0000000000
--- a/.cirun.yml
+++ /dev/null
@@ -1,9 +0,0 @@
-runners:
-  - name: "zarr-gpu-runner"
-    cloud: "gcp"
-    gpu: "nvidia-tesla-t4"
-    instance_type: "n1-standard-1"
-    machine_image: "common-cu123-v20240730-ubuntu-2204-py310"
-    preemptible: true
-    labels:
-      - "cirun-gpu-runner"
diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
deleted file mode 100644
index f552dca8c0..0000000000
--- a/.github/workflows/gpu_test.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
-# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
-
-name: Test V3
-
-on:
-  push:
-    branches: [ v3 ]
-  pull_request:
-    branches: [ v3 ]
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  test:
-    name: py=${{ matrix.python-version }}, np=${{ matrix.numpy-version }}, deps=${{ matrix.dependency-set }}
-
-    runs-on: "cirun-gpu-runner--${{ github.run_id }}"
-    strategy:
-      matrix:
-        python-version: ['3.10', '3.11', '3.12']
-        numpy-version: ['1.24', '1.26', '2.0']
-        dependency-set: ["minimal", "optional"]
-
-    steps:
-    - uses: actions/checkout@v4
-    - name: GPU check
-      run: |
-        nvidia-smi
-    - name: Set up Python
-      uses: actions/setup-python@v5
-      with:
-        python-version: ${{ matrix.python-version }}
-        cache: 'pip'
-    - name: Install Hatch
-      run: |
-        python -m pip install --upgrade pip 
-        pip install hatch
-    - name: Set Up Hatch Env
-      run: |
-        hatch env create test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }}
-        hatch env run -e test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }} list-env
-    - name: Run Tests
-      run: |
-        hatch env run --env test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }} run

From 72d172d393231337865d4deaaf3d782705bd9aaf Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 9 Aug 2024 16:18:42 -0700
Subject: [PATCH 24/46] Adding pytest mark for GPU tests

---
 pyproject.toml            |  3 +++
 src/zarr/testing/utils.py | 22 ++++++++++++++++++++++
 tests/v3/conftest.py      |  3 +++
 tests/v3/test_buffer.py   |  5 +++--
 4 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6fb1729314..83c0949292 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -225,3 +225,6 @@ filterwarnings = [
     "ignore:The loop argument is deprecated since Python 3.8.*:DeprecationWarning",
     "ignore:Creating a zarr.buffer.gpu.*:UserWarning",
 ]
+markers = [
+    "gpu: mark a test as requiring CuPy and GPU"
+]
diff --git a/src/zarr/testing/utils.py b/src/zarr/testing/utils.py
index 67c6c72de7..65df6ed203 100644
--- a/src/zarr/testing/utils.py
+++ b/src/zarr/testing/utils.py
@@ -1,5 +1,9 @@
 from __future__ import annotations
 
+from typing import Any, cast
+
+import pytest
+
 from zarr.buffer import Buffer
 from zarr.common import BytesLike
 
@@ -16,3 +20,21 @@ def assert_bytes_equal(b1: Buffer | BytesLike | None, b2: Buffer | BytesLike | N
     if isinstance(b2, Buffer):
         b2 = b2.to_bytes()
     assert b1 == b2
+
+
+def has_cupy() -> bool:
+    try:
+        import cupy
+
+        return cast(bool, cupy.cuda.runtime.getDeviceCount() > 0)
+    except ImportError:
+        return False
+    except cupy.cuda.runtime.CUDARuntimeError:
+        return False
+
+
+# Decorator for GPU tests
+def gpu_test(func: Any) -> Any:
+    return pytest.mark.gpu(
+        pytest.mark.skipif(not has_cupy(), reason="CuPy not installed or no GPU available")(func)
+    )
diff --git a/tests/v3/conftest.py b/tests/v3/conftest.py
index 8b75d9f2f8..0669f676ac 100644
--- a/tests/v3/conftest.py
+++ b/tests/v3/conftest.py
@@ -93,6 +93,9 @@ async def async_group(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> As
 def xp(request: pytest.FixtureRequest) -> Iterator[ModuleType]:
     """Fixture to parametrize over numpy-like libraries"""
 
+    if request.param == "cupy":
+        request.node.add_marker(pytest.mark.gpu)
+
     yield pytest.importorskip(request.param)
 
 
diff --git a/tests/v3/test_buffer.py b/tests/v3/test_buffer.py
index 38e724bc51..6adc86494a 100644
--- a/tests/v3/test_buffer.py
+++ b/tests/v3/test_buffer.py
@@ -24,6 +24,7 @@
 from zarr.codecs.zstd import ZstdCodec
 from zarr.store.core import StorePath
 from zarr.store.memory import MemoryStore
+from zarr.testing.utils import gpu_test
 
 if TYPE_CHECKING:
     from typing_extensions import Self
@@ -118,7 +119,7 @@ async def test_async_array_prototype():
     assert np.array_equal(expect, got)
 
 
-@pytest.mark.skipif(cp is None, reason="requires cupy")
+@gpu_test
 @pytest.mark.asyncio
 async def test_async_array_gpu_prototype():
     """Test the use of the GPU buffer prototype"""
@@ -175,7 +176,7 @@ async def test_codecs_use_of_prototype():
     assert np.array_equal(expect, got)
 
 
-@pytest.mark.skipif(cp is None, reason="requires cupy")
+@gpu_test
 @pytest.mark.asyncio
 async def test_codecs_use_of_gpu_prototype():
     expect = cp.zeros((10, 10), dtype="uint16", order="F")

From 3db61bd09ae6b2448ca3acc2a3f6e046fb8c5529 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 9 Aug 2024 16:21:49 -0700
Subject: [PATCH 25/46] Updating GPU memory store test with gpu mark

---
 tests/v3/test_store/test_memory.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/v3/test_store/test_memory.py b/tests/v3/test_store/test_memory.py
index fc8e813579..bb4ed9e3d6 100644
--- a/tests/v3/test_store/test_memory.py
+++ b/tests/v3/test_store/test_memory.py
@@ -5,11 +5,7 @@
 from zarr.buffer import Buffer, cpu, gpu
 from zarr.store.memory import GpuMemoryStore, MemoryStore
 from zarr.testing.store import StoreTests
-
-try:
-    import cupy as cp
-except ImportError:
-    cp = None
+from zarr.testing.utils import gpu_test
 
 
 class TestMemoryStore(StoreTests[MemoryStore, cpu.Buffer]):
@@ -46,7 +42,7 @@ def test_list_prefix(self, store: MemoryStore) -> None:
         assert True
 
 
-@pytest.mark.skipif(cp is None, reason="requires cupy")
+@gpu_test
 class TestGpuMemoryStore(StoreTests[GpuMemoryStore, gpu.Buffer]):
     store_cls = GpuMemoryStore
     buffer_cls = gpu.Buffer

From 425c3f8989732122cd90e8ae2f72238d8cb8ae24 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 9 Aug 2024 16:37:16 -0700
Subject: [PATCH 26/46] Adding GPU workflow that only runs GPU tests

---
 .github/workflows/gpu_test.yml | 48 ++++++++++++++++++++++++++++++++++
 pyproject.toml                 |  1 +
 2 files changed, 49 insertions(+)
 create mode 100644 .github/workflows/gpu_test.yml

diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
new file mode 100644
index 0000000000..c7b10b795e
--- /dev/null
+++ b/.github/workflows/gpu_test.yml
@@ -0,0 +1,48 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: GPU Test V3
+
+on:
+  push:
+    branches: [ v3 ]
+  pull_request:
+    branches: [ v3 ]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  test:
+    name: py=${{ matrix.python-version }}, np=${{ matrix.numpy-version }}, deps=${{ matrix.dependency-set }}
+
+    runs-on: gpu-runner
+    strategy:
+      matrix:
+        python-version: ['3.10', '3.11', '3.12']
+        numpy-version: ['1.24', '1.26', '2.0']
+        dependency-set: ["minimal", "optional"]
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: GPU check
+      run: |
+        nvidia-smi
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+        cache: 'pip'
+    - name: Install Hatch
+      run: |
+        python -m pip install --upgrade pip 
+        pip install hatch
+    - name: Set Up Hatch Env
+      run: |
+        hatch env create test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }}
+        hatch env run -e test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }} list-env
+    - name: Run Tests
+      run: |
+        hatch env run --env test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }} run-coverage-gpu
diff --git a/pyproject.toml b/pyproject.toml
index 83c0949292..28d2711aa6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -139,6 +139,7 @@ features = ["optional"]
 
 [tool.hatch.envs.test.scripts]
 run-coverage = "pytest --cov-config=pyproject.toml --cov=pkg --cov=tests"
+run-coverage-gpu = "pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov=tests"
 run = "run-coverage --no-cov"
 run-verbose = "run-coverage --verbose"
 run-mypy = "mypy src"

From c8c7e6dcd69c203a37448f9375c6dbdb1443ea48 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Wed, 21 Aug 2024 11:54:59 -0700
Subject: [PATCH 27/46] Formatting

---
 tests/v3/test_buffer.py | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/tests/v3/test_buffer.py b/tests/v3/test_buffer.py
index 9c168e5c4b..50acb9c18e 100644
--- a/tests/v3/test_buffer.py
+++ b/tests/v3/test_buffer.py
@@ -19,13 +19,13 @@
 from zarr.codecs.zstd import ZstdCodec
 from zarr.store.core import StorePath
 from zarr.store.memory import MemoryStore
-from zarr.testing.utils import gpu_test
 from zarr.testing.buffer import (
     NDBufferUsingTestNDArrayLike,
     StoreExpectingTestBuffer,
     TestBuffer,
     TestNDArrayLike,
 )
+from zarr.testing.utils import gpu_test
 
 try:
     import cupy as cp
@@ -53,9 +53,7 @@ async def test_async_array_prototype():
     )
     expect[1:4, 3:6] = np.ones((3, 3))
 
-    my_prototype = BufferPrototype(
-        buffer=TestBuffer, nd_buffer=NDBufferUsingTestNDArrayLike
-    )
+    my_prototype = BufferPrototype(buffer=TestBuffer, nd_buffer=NDBufferUsingTestNDArrayLike)
 
     await a.setitem(
         selection=(slice(1, 4), slice(3, 6)),
@@ -87,9 +85,7 @@ async def test_async_array_gpu_prototype():
         value=cp.ones((3, 3)),
         prototype=gpu.buffer_prototype,
     )
-    got = await a.getitem(
-        selection=(slice(0, 9), slice(0, 9)), prototype=gpu.buffer_prototype
-    )
+    got = await a.getitem(selection=(slice(0, 9), slice(0, 9)), prototype=gpu.buffer_prototype)
     assert isinstance(got, cp.ndarray)
     assert cp.array_equal(expect, got)
 
@@ -114,18 +110,14 @@ async def test_codecs_use_of_prototype():
     )
     expect[:] = np.arange(100).reshape(10, 10)
 
-    my_prototype = BufferPrototype(
-        buffer=TestBuffer, nd_buffer=NDBufferUsingTestNDArrayLike
-    )
+    my_prototype = BufferPrototype(buffer=TestBuffer, nd_buffer=NDBufferUsingTestNDArrayLike)
 
     await a.setitem(
         selection=(slice(0, 10), slice(0, 10)),
         value=expect[:],
         prototype=my_prototype,
     )
-    got = await a.getitem(
-        selection=(slice(0, 10), slice(0, 10)), prototype=my_prototype
-    )
+    got = await a.getitem(selection=(slice(0, 10), slice(0, 10)), prototype=my_prototype)
     assert isinstance(got, TestNDArrayLike)
     assert np.array_equal(expect, got)
 
@@ -156,17 +148,13 @@ async def test_codecs_use_of_gpu_prototype():
         value=expect[:],
         prototype=gpu.buffer_prototype,
     )
-    got = await a.getitem(
-        selection=(slice(0, 10), slice(0, 10)), prototype=gpu.buffer_prototype
-    )
+    got = await a.getitem(selection=(slice(0, 10), slice(0, 10)), prototype=gpu.buffer_prototype)
     assert isinstance(got, cp.ndarray)
     assert cp.array_equal(expect, got)
 
 
 def test_numpy_buffer_prototype():
     buffer = cpu.buffer_prototype().buffer.create_zero_length()
-    ndbuffer = cpu.buffer_prototype().nd_buffer.create(
-        shape=(1, 2), dtype=np.dtype("int64")
-    )
+    ndbuffer = cpu.buffer_prototype().nd_buffer.create(shape=(1, 2), dtype=np.dtype("int64"))
     assert isinstance(buffer.as_array_like(), np.ndarray)
     assert isinstance(ndbuffer.as_ndarray_like(), np.ndarray)

From 25a67ca7fa4187bc080725188d1807bae52252a9 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Thu, 22 Aug 2024 22:25:31 -0700
Subject: [PATCH 28/46] Fixing mypy errors in buffer code

---
 src/zarr/buffer/__init__.py |  4 +++-
 src/zarr/buffer/core.py     | 11 -----------
 src/zarr/buffer/cpu.py      | 17 +++++++++++++++--
 tests/v3/test_group.py      |  1 +
 4 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/src/zarr/buffer/__init__.py b/src/zarr/buffer/__init__.py
index d4de3b2786..79668b365e 100644
--- a/src/zarr/buffer/__init__.py
+++ b/src/zarr/buffer/__init__.py
@@ -4,8 +4,9 @@
     BufferPrototype,
     NDArrayLike,
     NDBuffer,
+    default_buffer_prototype,
 )
-from zarr.buffer.cpu import default_buffer_prototype
+from zarr.buffer.cpu import numpy_buffer_prototype
 
 __all__ = [
     "ArrayLike",
@@ -14,4 +15,5 @@
     "NDBuffer",
     "BufferPrototype",
     "default_buffer_prototype",
+    "numpy_buffer_prototype",
 ]
diff --git a/src/zarr/buffer/core.py b/src/zarr/buffer/core.py
index 20722ae036..aa724b0e88 100644
--- a/src/zarr/buffer/core.py
+++ b/src/zarr/buffer/core.py
@@ -21,8 +21,6 @@
 from zarr.registry import (
     get_buffer_class,
     get_ndbuffer_class,
-    register_buffer,
-    register_ndbuffer,
 )
 
 if TYPE_CHECKING:
@@ -488,12 +486,3 @@ class BufferPrototype(NamedTuple):
 # The default buffer prototype used throughout the Zarr codebase.
 def default_buffer_prototype() -> BufferPrototype:
     return BufferPrototype(buffer=get_buffer_class(), nd_buffer=get_ndbuffer_class())
-
-
-# The numpy prototype used for E.g. when reading the shard index
-def numpy_buffer_prototype() -> BufferPrototype:
-    return BufferPrototype(buffer=Buffer, nd_buffer=NDBuffer)
-
-
-register_buffer(Buffer)
-register_ndbuffer(NDBuffer)
diff --git a/src/zarr/buffer/cpu.py b/src/zarr/buffer/cpu.py
index 6a44f30b70..448093acd9 100644
--- a/src/zarr/buffer/cpu.py
+++ b/src/zarr/buffer/cpu.py
@@ -12,6 +12,10 @@
 
 from zarr.buffer import core
 from zarr.buffer.core import ArrayLike, NDArrayLike
+from zarr.registry import (
+    register_buffer,
+    register_ndbuffer,
+)
 
 if TYPE_CHECKING:
     from typing_extensions import Self
@@ -210,5 +214,14 @@ def as_numpy_array_wrapper(
 
 
 # CPU buffer prototype using numpy arrays
-buffer_prototype = core.BufferPrototype(buffer=Buffer, nd_buffer=NDBuffer)
-default_buffer_prototype = buffer_prototype
+# buffer_prototype = core.BufferPrototype(buffer=Buffer, nd_buffer=NDBuffer)
+# default_buffer_prototype = buffer_prototype
+
+
+# The numpy prototype used for E.g. when reading the shard index
+def numpy_buffer_prototype() -> core.BufferPrototype:
+    return core.BufferPrototype(buffer=Buffer, nd_buffer=NDBuffer)
+
+
+register_buffer(Buffer)
+register_ndbuffer(NDBuffer)
diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py
index d1fd0be542..022c025a68 100644
--- a/tests/v3/test_group.py
+++ b/tests/v3/test_group.py
@@ -7,6 +7,7 @@
 from _pytest.compat import LEGACY_PATH
 
 from zarr.array import Array, AsyncArray
+from zarr.buffer import default_buffer_prototype
 from zarr.common import ZarrFormat
 from zarr.errors import ContainsArrayError, ContainsGroupError
 from zarr.group import AsyncGroup, Group, GroupMetadata

From ac061d98df16d32a82b0f27ea5a4c77ac54135d6 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Thu, 22 Aug 2024 23:22:59 -0700
Subject: [PATCH 29/46] Fixing errors in test_buffer.py

---
 src/zarr/core/buffer/cpu.py | 2 +-
 src/zarr/core/config.py     | 4 ++--
 src/zarr/testing/buffer.py  | 6 +++---
 tests/v3/test_buffer.py     | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/zarr/core/buffer/cpu.py b/src/zarr/core/buffer/cpu.py
index 45d204789d..0953805223 100644
--- a/src/zarr/core/buffer/cpu.py
+++ b/src/zarr/core/buffer/cpu.py
@@ -214,7 +214,7 @@ def as_numpy_array_wrapper(
 
 
 # CPU buffer prototype using numpy arrays
-# buffer_prototype = core.BufferPrototype(buffer=Buffer, nd_buffer=NDBuffer)
+buffer_prototype = core.BufferPrototype(buffer=Buffer, nd_buffer=NDBuffer)
 # default_buffer_prototype = buffer_prototype
 
 
diff --git a/src/zarr/core/config.py b/src/zarr/core/config.py
index 67f5c74347..611d1faea5 100644
--- a/src/zarr/core/config.py
+++ b/src/zarr/core/config.py
@@ -60,8 +60,8 @@ def reset(self) -> None:
                 "sharding_indexed": "zarr.codecs.sharding.ShardingCodec",
                 "transpose": "zarr.codecs.transpose.TransposeCodec",
             },
-            "buffer": "zarr.core.buffer.Buffer",
-            "ndbuffer": "zarr.core.buffer.NDBuffer",
+            "buffer": "zarr.core.buffer.cpu.Buffer",
+            "ndbuffer": "zarr.core.buffer.cpu.NDBuffer",
         }
     ],
 )
diff --git a/src/zarr/testing/buffer.py b/src/zarr/testing/buffer.py
index ee170a5dd3..bbdb52e8cc 100644
--- a/src/zarr/testing/buffer.py
+++ b/src/zarr/testing/buffer.py
@@ -7,7 +7,7 @@
 import numpy as np
 import numpy.typing as npt
 
-from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer
+from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer, cpu
 from zarr.store import MemoryStore
 
 if TYPE_CHECKING:
@@ -25,11 +25,11 @@ class TestNDArrayLike(np.ndarray):
     """An example of a ndarray-like class"""
 
 
-class TestBuffer(Buffer):
+class TestBuffer(cpu.Buffer):
     """Example of a custom Buffer that handles ArrayLike"""
 
 
-class NDBufferUsingTestNDArrayLike(NDBuffer):
+class NDBufferUsingTestNDArrayLike(cpu.NDBuffer):
     """Example of a custom NDBuffer that handles MyNDArrayLike"""
 
     @classmethod
diff --git a/tests/v3/test_buffer.py b/tests/v3/test_buffer.py
index b8f91bc8cc..419f01c720 100644
--- a/tests/v3/test_buffer.py
+++ b/tests/v3/test_buffer.py
@@ -148,7 +148,7 @@ async def test_codecs_use_of_gpu_prototype():
 
 
 def test_numpy_buffer_prototype():
-    buffer = cpu.buffer_prototype().buffer.create_zero_length()
-    ndbuffer = cpu.buffer_prototype().nd_buffer.create(shape=(1, 2), dtype=np.dtype("int64"))
+    buffer = cpu.buffer_prototype.buffer.create_zero_length()
+    ndbuffer = cpu.buffer_prototype.nd_buffer.create(shape=(1, 2), dtype=np.dtype("int64"))
     assert isinstance(buffer.as_array_like(), np.ndarray)
     assert isinstance(ndbuffer.as_ndarray_like(), np.ndarray)

From 523d8d59b7e3e4b2e9e92e692aedef5e540003ae Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Thu, 22 Aug 2024 23:23:24 -0700
Subject: [PATCH 30/46] Fixing errors in test_buffer.py

---
 src/zarr/testing/buffer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/zarr/testing/buffer.py b/src/zarr/testing/buffer.py
index bbdb52e8cc..a6120ef2f9 100644
--- a/src/zarr/testing/buffer.py
+++ b/src/zarr/testing/buffer.py
@@ -7,7 +7,7 @@
 import numpy as np
 import numpy.typing as npt
 
-from zarr.core.buffer import Buffer, BufferPrototype, NDBuffer, cpu
+from zarr.core.buffer import Buffer, BufferPrototype, cpu
 from zarr.store import MemoryStore
 
 if TYPE_CHECKING:

From b559ee46bfe1520675aed29182d196fb9779fb16 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 23 Aug 2024 13:48:20 -0700
Subject: [PATCH 31/46] Fixing store test errors

---
 src/zarr/testing/store.py          | 4 ++--
 tests/v3/test_store/test_memory.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py
index 772a18fdbe..42e83f063d 100644
--- a/src/zarr/testing/store.py
+++ b/src/zarr/testing/store.py
@@ -161,11 +161,11 @@ async def test_delete(self, store: S) -> None:
 
     async def test_empty(self, store: S) -> None:
         assert await store.empty()
-        self.set(store, "key", Buffer.from_bytes(bytes("something", encoding="utf-8")))
+        self.set(store, "key", self.buffer_cls.from_bytes(bytes("something", encoding="utf-8")))
         assert not await store.empty()
 
     async def test_clear(self, store: S) -> None:
-        self.set(store, "key", Buffer.from_bytes(bytes("something", encoding="utf-8")))
+        self.set(store, "key", self.buffer_cls.from_bytes(bytes("something", encoding="utf-8")))
         await store.clear()
         assert await store.empty()
 
diff --git a/tests/v3/test_store/test_memory.py b/tests/v3/test_store/test_memory.py
index 1fc6a7b88f..f76423c631 100644
--- a/tests/v3/test_store/test_memory.py
+++ b/tests/v3/test_store/test_memory.py
@@ -57,7 +57,7 @@ def get(self, store: MemoryStore, key: str) -> Buffer:
 
     @pytest.fixture(scope="function", params=[None, {}])
     def store_kwargs(self, request) -> dict[str, str | None | dict[str, Buffer]]:
-        return {"store_dict": request.param, "mode": "w"}
+        return {"store_dict": request.param, "mode": "r+"}
 
     @pytest.fixture(scope="function")
     def store(self, store_kwargs: str | None | dict[str, gpu.Buffer]) -> GpuMemoryStore:

From 26a74f4c3746002b96444364fd5027b694220aa4 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 23 Aug 2024 13:51:49 -0700
Subject: [PATCH 32/46] Fixing stateful store test

---
 tests/v3/test_store/test_stateful_store.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/v3/test_store/test_stateful_store.py b/tests/v3/test_store/test_stateful_store.py
index 68bd11bbe8..a8f51e96e6 100644
--- a/tests/v3/test_store/test_stateful_store.py
+++ b/tests/v3/test_store/test_stateful_store.py
@@ -12,7 +12,7 @@
 
 import zarr
 from zarr.abc.store import AccessMode, Store
-from zarr.core.buffer import Buffer, BufferPrototype, default_buffer_prototype
+from zarr.core.buffer import BufferPrototype, cpu, default_buffer_prototype
 from zarr.store import MemoryStore
 from zarr.testing.strategies import key_ranges, paths
 
@@ -112,7 +112,7 @@ def __init__(self):
     def set(self, key: str, data: bytes) -> None:
         note(f"(set) Setting {key!r} with {data}")
         assert not self.store.mode.readonly
-        data_buf = Buffer.from_bytes(data)
+        data_buf = cpu.Buffer.from_bytes(data)
         self.store.set(key, data_buf)
         self.model[key] = data_buf
 

From 73078330cb8f4bb4c6f08c4fed6536461aef481e Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 23 Aug 2024 14:33:50 -0700
Subject: [PATCH 33/46] Fixing config test

---
 tests/v3/test_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/v3/test_config.py b/tests/v3/test_config.py
index 881833797c..b3e04e51da 100644
--- a/tests/v3/test_config.py
+++ b/tests/v3/test_config.py
@@ -46,8 +46,8 @@ def test_config_defaults_set() -> None:
                 "path": "zarr.codecs.pipeline.BatchedCodecPipeline",
                 "batch_size": 1,
             },
-            "buffer": "zarr.core.buffer.Buffer",
-            "ndbuffer": "zarr.core.buffer.NDBuffer",
+            "buffer": "zarr.core.buffer.cpu.Buffer",
+            "ndbuffer": "zarr.core.buffer.cpu.NDBuffer",
             "codecs": {
                 "blosc": "zarr.codecs.blosc.BloscCodec",
                 "gzip": "zarr.codecs.gzip.GzipCodec",

From f6fddd9bb8efffc035e8da60547c7a774064189b Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 23 Aug 2024 14:36:41 -0700
Subject: [PATCH 34/46] Fixing group tests

---
 tests/v3/test_group.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py
index 8b3ca3d0fd..a9812170dc 100644
--- a/tests/v3/test_group.py
+++ b/tests/v3/test_group.py
@@ -96,14 +96,16 @@ def test_group_members(store: MemoryStore | LocalStore, zarr_format: ZarrFormat)
 
     # add an extra object to the domain of the group.
     # the list of children should ignore this object.
-    sync(store.set(f"{path}/extra_object-1", default_buffer_prototype.buffer.from_bytes(b"000000")))
+    sync(
+        store.set(f"{path}/extra_object-1", default_buffer_prototype().buffer.from_bytes(b"000000"))
+    )
     # add an extra object under a directory-like prefix in the domain of the group.
     # this creates a directory with a random key in it
     # this should not show up as a member
     sync(
         store.set(
             f"{path}/extra_directory/extra_object-2",
-            default_buffer_prototype.buffer.from_bytes(b"000000"),
+            default_buffer_prototype().buffer.from_bytes(b"000000"),
         )
     )
     members_observed = group.members

From 2b1fe142e416bff8e39ab09bbb415ccbc7121e1d Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 23 Aug 2024 14:38:41 -0700
Subject: [PATCH 35/46] Fixing indexing tests

---
 tests/v3/test_indexing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/v3/test_indexing.py b/tests/v3/test_indexing.py
index 9900f26651..5805de3035 100644
--- a/tests/v3/test_indexing.py
+++ b/tests/v3/test_indexing.py
@@ -136,7 +136,7 @@ def test_get_basic_selection_0d(store: StorePath, use_out: bool, value: Any, dty
 
     if use_out:
         # test out param
-        b = default_buffer_prototype.nd_buffer.from_numpy_array(np.zeros_like(arr_np))
+        b = default_buffer_prototype().nd_buffer.from_numpy_array(np.zeros_like(arr_np))
         arr_z.get_basic_selection(Ellipsis, out=b)
         assert_array_equal(arr_np, b.as_ndarray_like())
 
@@ -247,7 +247,7 @@ def _test_get_basic_selection(a, z, selection):
     assert_array_equal(expect, actual)
 
     # test out param
-    b = default_buffer_prototype.nd_buffer.from_numpy_array(
+    b = default_buffer_prototype().nd_buffer.from_numpy_array(
         np.empty(shape=expect.shape, dtype=expect.dtype)
     )
     z.get_basic_selection(selection, out=b)

From abd135f6b23c019d96013122f89436adbe6cd1b2 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 23 Aug 2024 15:32:45 -0700
Subject: [PATCH 36/46] Manually installing cupy in the GPU workflow

---
 .github/workflows/gpu_test.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
index c7b10b795e..47993cbc0d 100644
--- a/.github/workflows/gpu_test.yml
+++ b/.github/workflows/gpu_test.yml
@@ -35,10 +35,10 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
         cache: 'pip'
-    - name: Install Hatch
+    - name: Install Hatch and CuPy
       run: |
         python -m pip install --upgrade pip 
-        pip install hatch
+        pip install hatch cupy-cuda12x
     - name: Set Up Hatch Env
       run: |
         hatch env create test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }}

From 1db58e73becd35d2298b5b3d3b8553c41662a04f Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Fri, 23 Aug 2024 17:00:28 -0700
Subject: [PATCH 37/46] Ablating GPU test matrix and adding gpu optional
 dependencies to the hatch dependency list

---
 .github/workflows/gpu_test.yml |  6 +++---
 pyproject.toml                 | 11 ++++++++---
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
index 47993cbc0d..c460d40b4c 100644
--- a/.github/workflows/gpu_test.yml
+++ b/.github/workflows/gpu_test.yml
@@ -21,9 +21,9 @@ jobs:
     runs-on: gpu-runner
     strategy:
       matrix:
-        python-version: ['3.10', '3.11', '3.12']
-        numpy-version: ['1.24', '1.26', '2.0']
-        dependency-set: ["minimal", "optional"]
+        python-version: ['3.11']
+        numpy-version: ['2.0']
+        dependency-set: ["gpu"]
 
     steps:
     - uses: actions/checkout@v4
diff --git a/pyproject.toml b/pyproject.toml
index 4cb70b9fb3..161e8827ce 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -75,7 +75,7 @@ jupyter = [
     'ipywidgets>=8.0.0',
 ]
 gpu = [
-    "cupy>=13.0.0",
+    "cupy-cuda12x",
 ]
 docs = [
     'sphinx',
@@ -123,9 +123,9 @@ build.hooks.vcs.version-file = "src/zarr/_version.py"
 [tool.hatch.envs.test]
 dependencies = [
     "numpy~={matrix:numpy}",
-    "universal_pathlib"
+    "universal_pathlib",
 ]
-features = ["test", "extra"]
+features = ["test", "extra", "gpu"]
 
 [[tool.hatch.envs.test.matrix]]
 python = ["3.10", "3.11", "3.12"]
@@ -137,6 +137,11 @@ python = ["3.10", "3.11", "3.12"]
 numpy = ["1.24", "1.26", "2.0"]
 features = ["optional"]
 
+[[tool.hatch.envs.test.matrix]]
+python = ["3.10", "3.11", "3.12"]
+numpy = ["1.24", "1.26", "2.0"]
+features = ["gpu"]
+
 [tool.hatch.envs.test.scripts]
 run-coverage = "pytest --cov-config=pyproject.toml --cov=pkg --cov=tests"
 run-coverage-gpu = "pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov=tests"

From 296bd022d831d0071716247044bbfd4323ea17b7 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Mon, 26 Aug 2024 11:29:22 -0700
Subject: [PATCH 38/46] Adding some more logging to debug GPU test failures

---
 .github/workflows/gpu_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
index c460d40b4c..9956d0f753 100644
--- a/.github/workflows/gpu_test.yml
+++ b/.github/workflows/gpu_test.yml
@@ -29,7 +29,7 @@ jobs:
     - uses: actions/checkout@v4
     - name: GPU check
       run: |
-        nvidia-smi
+        nvidia-smi && ls -l /usr/local/cuda/lib64/ && echo ${LD_LIBRARY_PATH}
     - name: Set up Python
       uses: actions/setup-python@v5
       with:

From b33c887d494321bbd14ec93d1fc6c56244bcf523 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Mon, 26 Aug 2024 14:23:11 -0700
Subject: [PATCH 39/46] Adding GA step to install the CUDA toolkit

---
 .github/workflows/gpu_test.yml | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
index 9956d0f753..13acb44b2f 100644
--- a/.github/workflows/gpu_test.yml
+++ b/.github/workflows/gpu_test.yml
@@ -27,9 +27,18 @@ jobs:
 
     steps:
     - uses: actions/checkout@v4
+    - name: cuda-toolkit
+      uses: Jimver/cuda-toolkit@v0.2.16
+      id: cuda-toolkit
+      with:
+        cuda: '12.5.0'
+      run: |
+        echo "Installed cuda version is: ${{steps.cuda-toolkit.outputs.cuda}}"
+        echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+        nvcc -V
     - name: GPU check
       run: |
-        nvidia-smi && ls -l /usr/local/cuda/lib64/ && echo ${LD_LIBRARY_PATH}
+        nvidia-smi && which nvcc && ls -l /usr/local/cuda/lib64/ && echo ${LD_LIBRARY_PATH}
     - name: Set up Python
       uses: actions/setup-python@v5
       with:

From e0da0fb193eabe40c61c4631aa99e06c276ba522 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Mon, 26 Aug 2024 17:28:14 -0700
Subject: [PATCH 40/46] Adding a separate gputest hatch environment to simplify
 GPU testing

---
 .github/workflows/gpu_test.yml | 10 +++++-----
 pyproject.toml                 | 24 ++++++++++++++++++++++--
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
index 13acb44b2f..7ff3457c6d 100644
--- a/.github/workflows/gpu_test.yml
+++ b/.github/workflows/gpu_test.yml
@@ -23,7 +23,7 @@ jobs:
       matrix:
         python-version: ['3.11']
         numpy-version: ['2.0']
-        dependency-set: ["gpu"]
+        dependency-set: ["minimal"]
 
     steps:
     - uses: actions/checkout@v4
@@ -47,11 +47,11 @@ jobs:
     - name: Install Hatch and CuPy
       run: |
         python -m pip install --upgrade pip 
-        pip install hatch cupy-cuda12x
+        pip install hatch
     - name: Set Up Hatch Env
       run: |
-        hatch env create test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }}
-        hatch env run -e test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }} list-env
+        hatch env create gputest.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }}
+        hatch env run -e gputest.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }} list-env
     - name: Run Tests
       run: |
-        hatch env run --env test.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }} run-coverage-gpu
+        hatch env run --env gputest.py${{ matrix.python-version }}-${{ matrix.numpy-version }}-${{ matrix.dependency-set }} run-coverage
diff --git a/pyproject.toml b/pyproject.toml
index 161e8827ce..8bee491a82 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -125,7 +125,7 @@ dependencies = [
     "numpy~={matrix:numpy}",
     "universal_pathlib",
 ]
-features = ["test", "extra", "gpu"]
+features = ["test", "extra"]
 
 [[tool.hatch.envs.test.matrix]]
 python = ["3.10", "3.11", "3.12"]
@@ -144,7 +144,27 @@ features = ["gpu"]
 
 [tool.hatch.envs.test.scripts]
 run-coverage = "pytest --cov-config=pyproject.toml --cov=pkg --cov=tests"
-run-coverage-gpu = "pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov=tests"
+run-coverage-gpu = "pip install cupy-cuda12x && pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov=tests"
+run = "run-coverage --no-cov"
+run-verbose = "run-coverage --verbose"
+run-mypy = "mypy src"
+run-hypothesis = "pytest --hypothesis-profile ci tests/v3/test_properties.py tests/v3/test_store/test_stateful*"
+list-env = "pip list"
+
+[tool.hatch.envs.gputest]
+dependencies = [
+    "numpy~={matrix:numpy}",
+    "universal_pathlib",
+]
+features = ["test", "extra", "gpu"]
+
+[[tool.hatch.envs.gputest.matrix]]
+python = ["3.10", "3.11", "3.12"]
+numpy = ["1.24", "1.26", "2.0"]
+version = ["minimal"]
+
+[tool.hatch.envs.gputest.scripts]
+run-coverage = "pytest -m gpu --cov-config=pyproject.toml --cov=pkg --cov=tests"
 run = "run-coverage --no-cov"
 run-verbose = "run-coverage --verbose"
 run-mypy = "mypy src"

From 07277afd347b20b106f9abcfa89d1b656f1a33cb Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Wed, 28 Aug 2024 13:51:50 -0700
Subject: [PATCH 41/46] Fixing error in cuda-toolkit step

---
 .github/workflows/gpu_test.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
index 7ff3457c6d..531aba615e 100644
--- a/.github/workflows/gpu_test.yml
+++ b/.github/workflows/gpu_test.yml
@@ -32,13 +32,12 @@ jobs:
       id: cuda-toolkit
       with:
         cuda: '12.5.0'
+    - name: GPU check
       run: |
         echo "Installed cuda version is: ${{steps.cuda-toolkit.outputs.cuda}}"
         echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
         nvcc -V
-    - name: GPU check
-      run: |
-        nvidia-smi && which nvcc && ls -l /usr/local/cuda/lib64/ && echo ${LD_LIBRARY_PATH}
+        nvidia-smi
     - name: Set up Python
       uses: actions/setup-python@v5
       with:

From 6e49e85f6853437e2d3bf814c64c45e22fd8e3a5 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Wed, 28 Aug 2024 15:25:12 -0700
Subject: [PATCH 42/46] Downgrading to CUDA 12.4.1 in cuda-toolkit GA

---
 .github/workflows/gpu_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
index 531aba615e..fafb35a5f4 100644
--- a/.github/workflows/gpu_test.yml
+++ b/.github/workflows/gpu_test.yml
@@ -31,7 +31,7 @@ jobs:
       uses: Jimver/cuda-toolkit@v0.2.16
       id: cuda-toolkit
       with:
-        cuda: '12.5.0'
+        cuda: '12.4.1'
     - name: GPU check
       run: |
         echo "Installed cuda version is: ${{steps.cuda-toolkit.outputs.cuda}}"

From 02c319cae17acb65eedfecf4ae04e08be1005769 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Wed, 28 Aug 2024 19:44:47 -0700
Subject: [PATCH 43/46] Trying manual install of the CUDA toolkit

---
 .github/workflows/gpu_test.yml | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
index fafb35a5f4..2ce8278650 100644
--- a/.github/workflows/gpu_test.yml
+++ b/.github/workflows/gpu_test.yml
@@ -27,15 +27,17 @@ jobs:
 
     steps:
     - uses: actions/checkout@v4
-    - name: cuda-toolkit
-      uses: Jimver/cuda-toolkit@v0.2.16
-      id: cuda-toolkit
-      with:
-        cuda: '12.4.1'
+    # - name: cuda-toolkit
+    #   uses: Jimver/cuda-toolkit@v0.2.16
+    #   id: cuda-toolkit
+    #   with:
+    #     cuda: '12.4.1'
     - name: GPU check
       run: |
-        echo "Installed cuda version is: ${{steps.cuda-toolkit.outputs.cuda}}"
-        echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+        wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
+        sudo dpkg -i cuda-keyring_1.1-1_all.deb
+        sudo apt-get update
+        sudo apt-get -y install cuda-toolkit-12-6
         nvcc -V
         nvidia-smi
     - name: Set up Python

From e82ddc1539c1a8dcbd0fe0a24791e4c07a784180 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Wed, 28 Aug 2024 22:23:12 -0700
Subject: [PATCH 44/46] Updating environment variables with CUDA installation

---
 .github/workflows/gpu_test.yml | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
index 2ce8278650..467f67d428 100644
--- a/.github/workflows/gpu_test.yml
+++ b/.github/workflows/gpu_test.yml
@@ -10,6 +10,10 @@ on:
     branches: [ v3 ]
   workflow_dispatch:
 
+env:
+  PATH: /usr/local/cuda/bin
+  LD_LIBRARY_PATH: /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
@@ -32,14 +36,19 @@ jobs:
     #   id: cuda-toolkit
     #   with:
     #     cuda: '12.4.1'
-    - name: GPU check
+    - name: Set up CUDA
       run: |
         wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
         sudo dpkg -i cuda-keyring_1.1-1_all.deb
         sudo apt-get update
         sudo apt-get -y install cuda-toolkit-12-6
-        nvcc -V
+        echo "/usr/local/cuda/bin" >> $GITHUB_PATH
+    - name: GPU check
+      run: |
         nvidia-smi
+        echo $PATH
+        echo $LD_LIBRARY_PATH
+        nvcc -V
     - name: Set up Python
       uses: actions/setup-python@v5
       with:

From 7854ce909c4c850b6b37f680075fcc4a2dc08a3f Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Thu, 29 Aug 2024 08:23:32 -0700
Subject: [PATCH 45/46] Removing PATH env and setting it only through
 GITHUB_PATH

---
 .github/workflows/gpu_test.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/gpu_test.yml b/.github/workflows/gpu_test.yml
index 467f67d428..e0be897532 100644
--- a/.github/workflows/gpu_test.yml
+++ b/.github/workflows/gpu_test.yml
@@ -11,7 +11,6 @@ on:
   workflow_dispatch:
 
 env:
-  PATH: /usr/local/cuda/bin
   LD_LIBRARY_PATH: /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64
 
 concurrency:

From 3852c9f06ea9a67acff6318898ecea3d6cfe4054 Mon Sep 17 00:00:00 2001
From: Akshay Subramaniam <6964110+akshaysubr@users.noreply.github.com>
Date: Thu, 29 Aug 2024 09:44:12 -0700
Subject: [PATCH 46/46] Fixing issue from merge conflict

---
 tests/v3/test_group.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/v3/test_group.py b/tests/v3/test_group.py
index f959db251b..7592c77201 100644
--- a/tests/v3/test_group.py
+++ b/tests/v3/test_group.py
@@ -109,7 +109,7 @@ def test_group_members(store: MemoryStore | LocalStore, zarr_format: ZarrFormat)
             default_buffer_prototype().buffer.from_bytes(b"000000"),
         )
     )
-    members_observed = group.members
+    members_observed = group.members()
     # members are not guaranteed to be ordered, so sort before comparing
     assert sorted(dict(members_observed)) == sorted(members_expected)