From 5483956ec93b0b1ed480c10db3fca3a2a834e587 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Thu, 2 Jan 2025 16:37:02 +0100 Subject: [PATCH 01/31] add creation from other zarr --- src/zarr/api/asynchronous.py | 17 +++++++++++++ src/zarr/api/synchronous.py | 2 +- tests/test_array.py | 47 ++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index c4d1ec8627..a4e081c23f 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -18,6 +18,8 @@ ChunkCoords, MemoryOrder, ZarrFormat, + parse_dtype, + concurrent_map, _warn_order_kwarg, _warn_write_empty_chunks_kwarg, parse_dtype, @@ -551,6 +553,21 @@ async def array( The new array. """ + if isinstance(data, Array): + chunks = kwargs.pop("chunks", None) or data.chunks + new_array = await create(shape=data.shape, chunks=chunks, dtype=data.dtype, **kwargs) + + async def _copy_chunk(chunk_coords: ChunkCoords) -> None: + await new_array.setitem(chunk_coords, await data._async_array.getitem(chunk_coords)) + + # Stream data from the source array to the new array + await concurrent_map( + [(region,) for region in data._iter_chunk_regions()], + _copy_chunk, + config.get("async.concurrency"), + ) + return new_array + # ensure data is array-like if not hasattr(data, "shape") or not hasattr(data, "dtype"): data = np.asanyarray(data) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index cd1ef8b38d..e0dbb1f5ad 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -339,7 +339,7 @@ def tree(grp: Group, expand: bool | None = None, level: int | None = None) -> An # TODO: add type annotations for kwargs -def array(data: npt.ArrayLike, **kwargs: Any) -> Array: +def array(data: npt.ArrayLike | Array, **kwargs: Any) -> Array: """Create an array filled with `data`. 
Parameters diff --git a/tests/test_array.py b/tests/test_array.py index 891538bc43..eb837b7659 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -2,6 +2,7 @@ import json import math import pickle +import time from itertools import accumulate from typing import Any, Literal @@ -865,6 +866,52 @@ async def test_special_complex_fill_values_roundtrip(fill_value: Any, expected: assert actual["fill_value"] == expected +async def test_creation_from_other_zarr(tmpdir): + src = zarr.zeros( + (2000, 20000), chunks=(1000, 1000), dtype="uint8", store=LocalStore(str(tmpdir)) + ) + src[:] = 1 + for _i in range(10): + start_time = time.time() + c = zarr.array(src, store=MemoryStore()) + end_time = time.time() + print(f"Time fast: {end_time - start_time} seconds") + + start_time = time.time() + b = zarr.zeros(src.shape, chunks=src.chunks, store=MemoryStore()) + b[:] = src[:] + end_time = time.time() + print(f"Time slow: {end_time - start_time} seconds") + + assert b[123, 123] == 1 + assert c[123, 123] == 1 + + +@pytest.mark.parametrize("shape", [(1,), (2, 3), (4, 5, 6)]) +@pytest.mark.parametrize("dtype", ["uint8", "float32"]) +@pytest.mark.parametrize("array_type", ["async", "sync"]) +async def test_nbytes( + shape: tuple[int, ...], dtype: str, array_type: Literal["async", "sync"] +) -> None: + """ + Test that the ``nbytes`` attribute of an Array or AsyncArray correctly reports the capacity of + the chunks of that array. + """ + store = MemoryStore() + arr = Array.create(store=store, shape=shape, dtype=dtype, fill_value=0) + if array_type == "async": + assert arr._async_array.nbytes == np.prod(arr.shape) * arr.dtype.itemsize + else: + assert arr.nbytes == np.prod(arr.shape) * arr.dtype.itemsize + + +async def test_scalar_array() -> None: + arr = zarr.array(1.5) + assert arr[...] 
== 1.5 + assert arr[()] == 1.5 + assert arr.shape == () + + @pytest.mark.parametrize("shape", [(1,), (2, 3), (4, 5, 6)]) @pytest.mark.parametrize("dtype", ["uint8", "float32"]) @pytest.mark.parametrize("array_type", ["async", "sync"]) From 9a32d1f5339964bb8c5585f7b7b6282d75f9616b Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Thu, 2 Jan 2025 16:53:39 +0100 Subject: [PATCH 02/31] remove duplicated tests --- src/zarr/api/asynchronous.py | 3 +- tests/test_array.py | 59 +++++++++++------------------------- 2 files changed, 18 insertions(+), 44 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index a4e081c23f..b064916d30 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -18,10 +18,9 @@ ChunkCoords, MemoryOrder, ZarrFormat, - parse_dtype, - concurrent_map, _warn_order_kwarg, _warn_write_empty_chunks_kwarg, + concurrent_map, parse_dtype, ) from zarr.core.config import config diff --git a/tests/test_array.py b/tests/test_array.py index eb837b7659..d203f99268 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -866,27 +866,6 @@ async def test_special_complex_fill_values_roundtrip(fill_value: Any, expected: assert actual["fill_value"] == expected -async def test_creation_from_other_zarr(tmpdir): - src = zarr.zeros( - (2000, 20000), chunks=(1000, 1000), dtype="uint8", store=LocalStore(str(tmpdir)) - ) - src[:] = 1 - for _i in range(10): - start_time = time.time() - c = zarr.array(src, store=MemoryStore()) - end_time = time.time() - print(f"Time fast: {end_time - start_time} seconds") - - start_time = time.time() - b = zarr.zeros(src.shape, chunks=src.chunks, store=MemoryStore()) - b[:] = src[:] - end_time = time.time() - print(f"Time slow: {end_time - start_time} seconds") - - assert b[123, 123] == 1 - assert c[123, 123] == 1 - - @pytest.mark.parametrize("shape", [(1,), (2, 3), (4, 5, 6)]) @pytest.mark.parametrize("dtype", ["uint8", "float32"]) @pytest.mark.parametrize("array_type", ["async", 
"sync"]) @@ -912,26 +891,22 @@ async def test_scalar_array() -> None: assert arr.shape == () -@pytest.mark.parametrize("shape", [(1,), (2, 3), (4, 5, 6)]) -@pytest.mark.parametrize("dtype", ["uint8", "float32"]) -@pytest.mark.parametrize("array_type", ["async", "sync"]) -async def test_nbytes( - shape: tuple[int, ...], dtype: str, array_type: Literal["async", "sync"] -) -> None: - """ - Test that the ``nbytes`` attribute of an Array or AsyncArray correctly reports the capacity of - the chunks of that array. - """ - store = MemoryStore() - arr = Array.create(store=store, shape=shape, dtype=dtype, fill_value=0) - if array_type == "async": - assert arr._async_array.nbytes == np.prod(arr.shape) * arr.dtype.itemsize - else: - assert arr.nbytes == np.prod(arr.shape) * arr.dtype.itemsize +async def test_creation_from_other_zarr(tmpdir): + src = zarr.zeros( + (2000, 20000), chunks=(1000, 1000), dtype="uint8", store=LocalStore(str(tmpdir)) + ) + src[:] = 1 + for _i in range(10): + start_time = time.time() + c = zarr.array(src, store=MemoryStore()) + end_time = time.time() + print(f"Time fast: {end_time - start_time} seconds") + start_time = time.time() + b = zarr.zeros(src.shape, chunks=src.chunks, store=MemoryStore()) + b[:] = src[:] + end_time = time.time() + print(f"Time slow: {end_time - start_time} seconds") -async def test_scalar_array() -> None: - arr = zarr.array(1.5) - assert arr[...] 
== 1.5 - assert arr[()] == 1.5 - assert arr.shape == () + assert b[123, 123] == 1 + assert c[123, 123] == 1 From 2c19072045c6644c048f90901e0cc5abff6866b0 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Thu, 2 Jan 2025 18:14:21 +0100 Subject: [PATCH 03/31] improve test --- src/zarr/api/asynchronous.py | 15 +++++++++--- tests/test_array.py | 47 ++++++++++++++++++++++-------------- 2 files changed, 40 insertions(+), 22 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index b064916d30..834f87bc20 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -27,6 +27,7 @@ from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata from zarr.core.metadata.v2 import _default_filters_and_compressor +from zarr.core.sync import sync from zarr.errors import NodeTypeValidationError from zarr.storage import ( StoreLike, @@ -535,7 +536,7 @@ async def tree(grp: AsyncGroup, expand: bool | None = None, level: int | None = async def array( - data: npt.ArrayLike, **kwargs: Any + data: npt.ArrayLike | Array, **kwargs: Any ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: """Create an array filled with `data`. 
@@ -553,11 +554,17 @@ async def array( """ if isinstance(data, Array): - chunks = kwargs.pop("chunks", None) or data.chunks - new_array = await create(shape=data.shape, chunks=chunks, dtype=data.dtype, **kwargs) + # fill missing arguments with metadata of data Array + kwargs.setdefault("dtype", data.dtype) + kwargs.setdefault("attributes", data.attrs) + kwargs.setdefault("chunks", data.chunks) + kwargs.setdefault("fill_value", data.fill_value) + + new_array = await create(data.shape, **kwargs) async def _copy_chunk(chunk_coords: ChunkCoords) -> None: - await new_array.setitem(chunk_coords, await data._async_array.getitem(chunk_coords)) + arr = await data._async_array.getitem(chunk_coords) + await new_array.setitem(chunk_coords, arr) # Stream data from the source array to the new array await concurrent_map( diff --git a/tests/test_array.py b/tests/test_array.py index d203f99268..03effea446 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -10,6 +10,8 @@ import numpy as np import pytest from numcodecs import Zstd +from numpy import dtype +from numpy.ma.testutils import assert_array_equal import zarr.api.asynchronous from zarr import Array, AsyncArray, Group @@ -892,21 +894,30 @@ async def test_scalar_array() -> None: async def test_creation_from_other_zarr(tmpdir): - src = zarr.zeros( - (2000, 20000), chunks=(1000, 1000), dtype="uint8", store=LocalStore(str(tmpdir)) - ) - src[:] = 1 - for _i in range(10): - start_time = time.time() - c = zarr.array(src, store=MemoryStore()) - end_time = time.time() - print(f"Time fast: {end_time - start_time} seconds") - - start_time = time.time() - b = zarr.zeros(src.shape, chunks=src.chunks, store=MemoryStore()) - b[:] = src[:] - end_time = time.time() - print(f"Time slow: {end_time - start_time} seconds") - - assert b[123, 123] == 1 - assert c[123, 123] == 1 + src_fill_value = 2 + src_dtype = np.dtype("uint8") + src_attributes = {} + src_chunks = (2, 2) + + src = zarr.create((10, 10), chunks=src_chunks, 
dtype=src_dtype, store=LocalStore(str(tmpdir)), fill_value = src_fill_value, attributes=src_attributes) + src[:] = np.arange(100).reshape((10,10)) + + result = zarr.array(src, store=MemoryStore()) + assert_array_equal(result[:], src[:]) + assert result.fill_value == src_fill_value + assert result.dtype==src_dtype + assert result.attrs.asdict() == src_attributes + assert result.chunks == src_chunks + + new_fill_value = 3 + new_dtype = np.dtype("uint16") + new_attributes = {"foo":"bar"} + new_chunks = (5, 10) + + result2 = zarr.array(src, store=MemoryStore(), chunks=new_chunks, dtype=new_dtype, fill_value = new_fill_value, attributes=new_attributes) + + assert_array_equal(result2[:], src[:]) + assert result2.fill_value == new_fill_value + assert result2.dtype == new_dtype + assert result2.attrs == new_attributes + assert result2.chunks == new_chunks \ No newline at end of file From 91152be3bf063cc94b391b57c8662416f3a0fcda Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Thu, 2 Jan 2025 18:15:52 +0100 Subject: [PATCH 04/31] test_iter_grid for non-squares --- tests/test_indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 04eb53e364..19198e0343 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -1872,7 +1872,7 @@ def test_iter_grid( """ Test that iter_grid works as expected for 1, 2, and 3 dimensions. 
""" - grid_shape = (5,) * ndim + grid_shape = (10,2,7)[:ndim] if origin_0d is not None: origin_kwarg = origin_0d * ndim From 3c5ec3fdfa1626e4e15174e1022dffe303d86ff5 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 8 Jan 2025 14:50:14 +0100 Subject: [PATCH 05/31] concurrent streaming for equal chunk sizes --- src/zarr/api/asynchronous.py | 18 ++++++++------- tests/conftest.py | 6 +++++ tests/test_array.py | 43 +++++++++++++++++++++--------------- tests/test_indexing.py | 12 +++++++++- 4 files changed, 52 insertions(+), 27 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 834f87bc20..9c84347f52 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -27,7 +27,6 @@ from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata from zarr.core.metadata.v2 import _default_filters_and_compressor -from zarr.core.sync import sync from zarr.errors import NodeTypeValidationError from zarr.storage import ( StoreLike, @@ -562,16 +561,19 @@ async def array( new_array = await create(data.shape, **kwargs) - async def _copy_chunk(chunk_coords: ChunkCoords) -> None: + async def _copy_chunk(chunk_coords: ChunkCoords|slice) -> None: arr = await data._async_array.getitem(chunk_coords) await new_array.setitem(chunk_coords, arr) - # Stream data from the source array to the new array - await concurrent_map( - [(region,) for region in data._iter_chunk_regions()], - _copy_chunk, - config.get("async.concurrency"), - ) + if new_array.chunks == data.chunks: + # Stream data from the source array to the new array + await concurrent_map( + [(region,) for region in data._iter_chunk_regions()], + _copy_chunk, + config.get("async.concurrency"), + ) + else: + await _copy_chunk(slice(None)) return new_array # ensure data is array-like diff --git a/tests/conftest.py b/tests/conftest.py index ee31d0d071..348635f8aa 100644 --- 
a/tests/conftest.py +++ b/tests/conftest.py @@ -76,6 +76,12 @@ async def store(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store: return await parse_store(param, str(tmpdir)) +@pytest.fixture +async def store2(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store: + param = request.param + return await parse_store(param, str(tmpdir / "store2")) + + @pytest.fixture(params=["local", "memory", "zip"]) def sync_store(request: pytest.FixtureRequest, tmp_path: LEGACY_PATH) -> Store: result = sync(parse_store(request.param, str(tmp_path))) diff --git a/tests/test_array.py b/tests/test_array.py index 03effea446..25f3ed9efe 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -2,7 +2,6 @@ import json import math import pickle -import time from itertools import accumulate from typing import Any, Literal @@ -10,7 +9,6 @@ import numpy as np import pytest from numcodecs import Zstd -from numpy import dtype from numpy.ma.testutils import assert_array_equal import zarr.api.asynchronous @@ -893,31 +891,40 @@ async def test_scalar_array() -> None: assert arr.shape == () -async def test_creation_from_other_zarr(tmpdir): +@pytest.mark.parametrize("store", ["memory", "local"], indirect=True) +@pytest.mark.parametrize("store2", ["memory", "local"], indirect=True) +@pytest.mark.parametrize("src_chunks", [(1, 2), (5, 5), (5, 10)]) +@pytest.mark.parametrize("new_chunks", [(1, 2), (5, 5), (5, 10)]) +async def test_creation_from_other_zarr(store, store2, src_chunks, new_chunks): src_fill_value = 2 src_dtype = np.dtype("uint8") src_attributes = {} - src_chunks = (2, 2) - src = zarr.create((10, 10), chunks=src_chunks, dtype=src_dtype, store=LocalStore(str(tmpdir)), fill_value = src_fill_value, attributes=src_attributes) - src[:] = np.arange(100).reshape((10,10)) - - result = zarr.array(src, store=MemoryStore()) - assert_array_equal(result[:], src[:]) - assert result.fill_value == src_fill_value - assert result.dtype==src_dtype - assert result.attrs.asdict() == 
src_attributes - assert result.chunks == src_chunks + src = zarr.create( + (10, 10), + chunks=src_chunks, + dtype=src_dtype, + store=store, + fill_value=src_fill_value, + attributes=src_attributes, + ) + src[:] = np.arange(100).reshape((10, 10)) new_fill_value = 3 new_dtype = np.dtype("uint16") - new_attributes = {"foo":"bar"} - new_chunks = (5, 10) - - result2 = zarr.array(src, store=MemoryStore(), chunks=new_chunks, dtype=new_dtype, fill_value = new_fill_value, attributes=new_attributes) + new_attributes = {"foo": "bar"} + + result2 = zarr.array( + src, + store=store2, + chunks=new_chunks, + dtype=new_dtype, + fill_value=new_fill_value, + attributes=new_attributes, + ) assert_array_equal(result2[:], src[:]) assert result2.fill_value == new_fill_value assert result2.dtype == new_dtype assert result2.attrs == new_attributes - assert result2.chunks == new_chunks \ No newline at end of file + assert result2.chunks == new_chunks diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 19198e0343..40e5c43dad 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -1872,7 +1872,7 @@ def test_iter_grid( """ Test that iter_grid works as expected for 1, 2, and 3 dimensions. 
""" - grid_shape = (10,2,7)[:ndim] + grid_shape = (10, 2, 7)[:ndim] if origin_0d is not None: origin_kwarg = origin_0d * ndim @@ -1954,3 +1954,13 @@ def test_vectorized_indexing_incompatible_shape(store) -> None: ) with pytest.raises(ValueError, match="Attempting to set"): arr[np.array([1, 2]), np.array([1, 2])] = np.array([[-1, -2], [-3, -4]]) + + +def test_iter_chunk_regions(): + chunks = (2, 3) + a = zarr.create((10, 10), chunks=chunks) + a[:] = 1 + for region in a._iter_chunk_regions(): + assert_array_equal(a[region], np.ones_like(a[region])) + a[region] = 0 + assert_array_equal(a[region], np.zeros_like(a[region])) From 79a45b13fedf7b4e7c18ca3d075171060e927532 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 8 Jan 2025 14:52:06 +0100 Subject: [PATCH 06/31] fix merge --- src/zarr/api/asynchronous.py | 3 ++- tests/test_array.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index f145ecb377..7025f38c7e 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -9,6 +9,7 @@ import numpy.typing as npt from typing_extensions import deprecated +from zarr import config from zarr.core.array import Array, AsyncArray, create_array, get_array_metadata from zarr.core.array_spec import ArrayConfig, ArrayConfigLike from zarr.core.buffer import NDArrayLike @@ -554,7 +555,7 @@ async def array( new_array = await create(data.shape, **kwargs) - async def _copy_chunk(chunk_coords: ChunkCoords|slice) -> None: + async def _copy_chunk(chunk_coords: ChunkCoords | slice) -> None: arr = await data._async_array.getitem(chunk_coords) await new_array.setitem(chunk_coords, arr) diff --git a/tests/test_array.py b/tests/test_array.py index cd3cf53e6b..9e8267a09a 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -9,7 +9,6 @@ import numcodecs import numpy as np import pytest -from numcodecs import Zstd from numpy.ma.testutils import assert_array_equal import 
zarr.api.asynchronous From da2f03f2b18df31fb014c68f9e6009592619e7b6 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 8 Jan 2025 15:04:58 +0100 Subject: [PATCH 07/31] fix mypy --- src/zarr/api/asynchronous.py | 2 +- tests/conftest.py | 3 ++- tests/test_array.py | 14 ++++++++------ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 7025f38c7e..ab0115bd62 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -9,7 +9,6 @@ import numpy.typing as npt from typing_extensions import deprecated -from zarr import config from zarr.core.array import Array, AsyncArray, create_array, get_array_metadata from zarr.core.array_spec import ArrayConfig, ArrayConfigLike from zarr.core.buffer import NDArrayLike @@ -25,6 +24,7 @@ concurrent_map, parse_dtype, ) +from zarr.core.config import config from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata from zarr.core.metadata.v2 import _default_compressor, _default_filters diff --git a/tests/conftest.py b/tests/conftest.py index 63325c4d31..56abf5f76e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -78,7 +78,8 @@ async def store(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store: @pytest.fixture async def store2(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store: param = request.param - return await parse_store(param, str(tmpdir / "store2")) + store2_path = tmpdir.mkdir("store2") + return await parse_store(param, str(store2_path)) @pytest.fixture(params=["local", "memory", "zip"]) diff --git a/tests/test_array.py b/tests/test_array.py index 9e8267a09a..a7ebd226c3 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -9,10 +9,10 @@ import numcodecs import numpy as np import pytest -from numpy.ma.testutils import assert_array_equal import zarr.api.asynchronous from zarr import Array, AsyncArray, 
Group +from zarr.abc.store import Store from zarr.codecs import ( BytesCodec, GzipCodec, @@ -1265,14 +1265,16 @@ async def test_scalar_array() -> None: assert arr.shape == () -@pytest.mark.parametrize("store", ["memory", "local"], indirect=True) -@pytest.mark.parametrize("store2", ["memory", "local"], indirect=True) +@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) +@pytest.mark.parametrize("store2", ["local", "memory", "zip"], indirect=["store2"]) @pytest.mark.parametrize("src_chunks", [(1, 2), (5, 5), (5, 10)]) @pytest.mark.parametrize("new_chunks", [(1, 2), (5, 5), (5, 10)]) -async def test_creation_from_other_zarr(store, store2, src_chunks, new_chunks): +async def test_creation_from_other_zarr( + store: Store, store2: Store, src_chunks: tuple[int, int], new_chunks: tuple[int, int] +) -> None: src_fill_value = 2 src_dtype = np.dtype("uint8") - src_attributes = {} + src_attributes = None src = zarr.create( (10, 10), @@ -1297,7 +1299,7 @@ async def test_creation_from_other_zarr(store, store2, src_chunks, new_chunks): attributes=new_attributes, ) - assert_array_equal(result2[:], src[:]) + np.testing.assert_array_equal(result2[:], src[:]) assert result2.fill_value == new_fill_value assert result2.dtype == new_dtype assert result2.attrs == new_attributes From 7728d7f5a3969064a897a6a3e3f68dcfcb61ac0e Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 8 Jan 2025 15:14:26 +0100 Subject: [PATCH 08/31] fix mypy --- src/zarr/api/asynchronous.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index ab0115bd62..6ac2598211 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -555,19 +555,19 @@ async def array( new_array = await create(data.shape, **kwargs) - async def _copy_chunk(chunk_coords: ChunkCoords | slice) -> None: - arr = await data._async_array.getitem(chunk_coords) + async def _copy_chunk(chunk_coords: ChunkCoords | 
slice, _data: Array) -> None: + arr = await _data._async_array.getitem(chunk_coords) await new_array.setitem(chunk_coords, arr) if new_array.chunks == data.chunks: # Stream data from the source array to the new array await concurrent_map( - [(region,) for region in data._iter_chunk_regions()], + [(region,data) for region in data._iter_chunk_regions()], _copy_chunk, config.get("async.concurrency"), ) else: - await _copy_chunk(slice(None)) + await _copy_chunk(slice(None), data) return new_array # ensure data is array-like From 2df18a0d4cc0251b663a97d22f46827a026d6eda Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 8 Jan 2025 15:42:40 +0100 Subject: [PATCH 09/31] fix test_iter_grid --- src/zarr/api/asynchronous.py | 2 +- tests/test_indexing.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 6ac2598211..5a73e88c42 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -562,7 +562,7 @@ async def _copy_chunk(chunk_coords: ChunkCoords | slice, _data: Array) -> None: if new_array.chunks == data.chunks: # Stream data from the source array to the new array await concurrent_map( - [(region,data) for region in data._iter_chunk_regions()], + [(region, data) for region in data._iter_chunk_regions()], _copy_chunk, config.get("async.concurrency"), ) diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 8527adfaca..11d913a685 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -1871,7 +1871,7 @@ def test_iter_grid( """ Test that iter_grid works as expected for 1, 2, and 3 dimensions. 
""" - grid_shape = (10, 2, 7)[:ndim] + grid_shape = (10, 5, 7)[:ndim] if origin_0d is not None: origin_kwarg = origin_0d * ndim From 03e25006fe3292b19a422d73903042a8b0a5b39e Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 8 Jan 2025 17:47:20 +0100 Subject: [PATCH 10/31] extract to zarr.from_array --- src/zarr/__init__.py | 2 + src/zarr/api/asynchronous.py | 28 +----- src/zarr/api/synchronous.py | 133 +++++++++++++++++++++++++++ src/zarr/core/array.py | 169 +++++++++++++++++++++++++++++++++++ tests/test_array.py | 60 +++++++++---- 5 files changed, 352 insertions(+), 40 deletions(-) diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index bcbdaf7c19..2e3293dbbf 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py @@ -10,6 +10,7 @@ create_group, empty, empty_like, + from_array, full, full_like, group, @@ -52,6 +53,7 @@ "create_group", "empty", "empty_like", + "from_array", "full", "full_like", "group", diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 5a73e88c42..32f4de965b 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -9,7 +9,7 @@ import numpy.typing as npt from typing_extensions import deprecated -from zarr.core.array import Array, AsyncArray, create_array, get_array_metadata +from zarr.core.array import Array, AsyncArray, create_array, from_array, get_array_metadata from zarr.core.array_spec import ArrayConfig, ArrayConfigLike from zarr.core.buffer import NDArrayLike from zarr.core.common import ( @@ -21,10 +21,8 @@ _default_zarr_format, _warn_order_kwarg, _warn_write_empty_chunks_kwarg, - concurrent_map, parse_dtype, ) -from zarr.core.config import config from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata from zarr.core.metadata.v2 import _default_compressor, _default_filters @@ -52,6 +50,7 @@ "create_array", "empty", "empty_like", + "from_array", "full", "full_like", "group", 
@@ -547,28 +546,7 @@ async def array( """ if isinstance(data, Array): - # fill missing arguments with metadata of data Array - kwargs.setdefault("dtype", data.dtype) - kwargs.setdefault("attributes", data.attrs) - kwargs.setdefault("chunks", data.chunks) - kwargs.setdefault("fill_value", data.fill_value) - - new_array = await create(data.shape, **kwargs) - - async def _copy_chunk(chunk_coords: ChunkCoords | slice, _data: Array) -> None: - arr = await _data._async_array.getitem(chunk_coords) - await new_array.setitem(chunk_coords, arr) - - if new_array.chunks == data.chunks: - # Stream data from the source array to the new array - await concurrent_map( - [(region, data) for region in data._iter_chunk_regions()], - _copy_chunk, - config.get("async.concurrency"), - ) - else: - await _copy_chunk(slice(None), data) - return new_array + return await from_array(data, **kwargs) # ensure data is array-like if not hasattr(data, "shape") or not hasattr(data, "dtype"): diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 2a2bccc9a0..7acd225c3e 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -47,6 +47,7 @@ "create_array", "empty", "empty_like", + "from_array", "full", "full_like", "group", @@ -893,6 +894,138 @@ def create_array( ) +def from_array( + data: Array, + store: str | StoreLike, + *, + name: str | None = None, + chunks: ChunkCoords | Literal["auto"] = "auto", + shards: ShardsLike | None = None, + filters: FiltersLike = "auto", + compressors: CompressorsLike = "auto", + serializer: SerializerLike = "auto", + fill_value: Any | None = None, + order: MemoryOrder | None = None, + zarr_format: ZarrFormat | None = 3, + attributes: dict[str, JSON] | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, + dimension_names: Iterable[str] | None = None, + storage_options: dict[str, Any] | None = None, + overwrite: bool = False, + config: ArrayConfig | ArrayConfigLike | None = None, +) -> 
Array: + """Create an array from an existing array. + + Parameters + ---------- + data : Array + The array to copy. + store : str or Store + Store or path to directory in file system or name of zip file for the new array. + name : str or None, optional + The name of the array within the store. If ``name`` is ``None``, the array will be located + at the root of the store. + chunks : ChunkCoords, optional + Chunk shape of the array. + If not specified, defaults to the chunk shape of the data array. + shards : ChunkCoords, optional + Shard shape of the array. The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. + + For Zarr format 3, a "filter" is a codec that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. + + For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that + the order if your filters is consistent with the behavior of each filter. + + If no ``filters`` are provided, defaults to the filters of the data array. + compressors : Iterable[Codec], optional + List of compressors to apply to the array. Compressors are applied in order, and after any + filters are applied (if any are specified) and the data is serialized into bytes. + + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr format 3. + + For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr format 2. + + If no ``compressors`` are provided, defaults to the compressors of the data array. + serializer : dict[str, JSON] | ArrayBytesCodec, optional + Array-to-bytes codec to use for encoding the array data. + Zarr format 3 only. 
Zarr format 2 arrays use implicit array-to-bytes conversion. + + If no ``serializer`` is provided, defaults to the serializer of the input array. + fill_value : Any, optional + Fill value for the array. + If not specified, defaults to the fill value of the data array. + order : {"C", "F"}, optional + The memory order of the array (default is "C"). + For Zarr format 2, this parameter sets the memory order of the array. + For Zarr format 3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the memory + order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. + If not specified, defaults to the memory order of the data array. + zarr_format : {2, 3}, optional + The zarr format to use when saving. + If not specified, defaults to the zarr format of the data array. + attributes : dict, optional + Attributes for the array. + If not specified, defaults to the attributes of the data array. + chunk_key_encoding : ChunkKeyEncoding, optional + A specification of how the chunk keys are represented in storage. + For Zarr format 3, the default is ``{"name": "default", "separator": "/"}``. + For Zarr format 2, the default is ``{"name": "v2", "separator": "."}``. + If not specified and the data array has the same zarr format as the target array, + the chunk key encoding of the data array is used. + dimension_names : Iterable[str], optional + The names of the dimensions (default is None). + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. + If not specified, defaults to the dimension names of the data array. + storage_options : dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. + overwrite : bool, default False + Whether to overwrite an array with the same name in the store, if one exists. 
+ config : ArrayConfig or ArrayConfigLike, optional + Runtime configuration for the array. + + Returns + ------- + Array + The array. + + Examples + -------- + #TODO + """ + return Array( + sync( + zarr.core.array.from_array( + data, + store, + name=name, + chunks=chunks, + shards=shards, + filters=filters, + compressors=compressors, + serializer=serializer, + fill_value=fill_value, + order=order, + zarr_format=zarr_format, + attributes=attributes, + chunk_key_encoding=chunk_key_encoding, + dimension_names=dimension_names, + storage_options=storage_options, + overwrite=overwrite, + config=config, + ) + ) + ) + + # TODO: add type annotations for kwargs def empty(shape: ChunkCoords, **kwargs: Any) -> Array: """Create an empty array. diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ea29a6fc48..29bb57d362 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -25,6 +25,7 @@ import numpy.typing as npt from typing_extensions import deprecated +import zarr from zarr._compat import _deprecate_positional_args from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec, Codec from zarr.abc.store import Store, set_or_delete @@ -3734,6 +3735,174 @@ class ShardsConfigParam(TypedDict): ShardsLike: TypeAlias = ChunkCoords | ShardsConfigParam | Literal["auto"] +async def from_array( + data: Array, + store: str | StoreLike, + *, + name: str | None = None, + chunks: ChunkCoords | Literal["auto"] = "auto", + shards: ShardsLike | None = None, + filters: FiltersLike = "auto", + compressors: CompressorsLike = "auto", + serializer: SerializerLike = "auto", + fill_value: Any | None = None, + order: MemoryOrder | None = None, + zarr_format: ZarrFormat | None = 3, + attributes: dict[str, JSON] | None = None, + chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, + dimension_names: Iterable[str] | None = None, + storage_options: dict[str, Any] | None = None, + overwrite: bool = False, + config: ArrayConfig | 
ArrayConfigLike | None = None, +) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: + """Create an array from an existing array. + + Parameters + ---------- + data : Array + The array to copy. + store : str or Store + Store or path to directory in file system or name of zip file for the new array. + name : str or None, optional + The name of the array within the store. If ``name`` is ``None``, the array will be located + at the root of the store. + chunks : ChunkCoords, optional + Chunk shape of the array. + If not specified, defaults to the chunk shape of the data array. + shards : ChunkCoords, optional + Shard shape of the array. The default value of ``None`` results in no sharding at all. + filters : Iterable[Codec], optional + Iterable of filters to apply to each chunk of the array, in order, before serializing that + chunk to bytes. + + For Zarr format 3, a "filter" is a codec that takes an array and returns an array, + and these values must be instances of ``ArrayArrayCodec``, or dict representations + of ``ArrayArrayCodec``. + + For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that + the order if your filters is consistent with the behavior of each filter. + + If no ``filters`` are provided, defaults to the filters of the data array. + compressors : Iterable[Codec], optional + List of compressors to apply to the array. Compressors are applied in order, and after any + filters are applied (if any are specified) and the data is serialized into bytes. + + For Zarr format 3, a "compressor" is a codec that takes a bytestream, and + returns another bytestream. Multiple compressors may be provided for Zarr format 3. + + For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may + be provided for Zarr format 2. + + If no ``compressors`` are provided, defaults to the compressors of the data array. 
+ serializer : dict[str, JSON] | ArrayBytesCodec, optional + Array-to-bytes codec to use for encoding the array data. + Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. + + If no ``serializer`` is provided, defaults to the serializer of the input array. + fill_value : Any, optional + Fill value for the array. + If not specified, defaults to the fill value of the data array. + order : {"C", "F"}, optional + The memory order of the array (default is "C"). + For Zarr format 2, this parameter sets the memory order of the array. + For Zarr format 3, this parameter is deprecated, because memory order + is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the memory + order for Zarr format 3 arrays is via the ``config`` parameter, e.g. ``{'order': 'C'}``. + If not specified, defaults to the memory order of the data array. + zarr_format : {2, 3}, optional + The zarr format to use when saving. + If not specified, defaults to the zarr format of the data array. + attributes : dict, optional + Attributes for the array. + If not specified, defaults to the attributes of the data array. + chunk_key_encoding : ChunkKeyEncoding, optional + A specification of how the chunk keys are represented in storage. + For Zarr format 3, the default is ``{"name": "default", "separator": "/"}``. + For Zarr format 2, the default is ``{"name": "v2", "separator": "."}``. + If not specified and the data array has the same zarr format as the target array, + the chunk key encoding of the data array is used. + dimension_names : Iterable[str], optional + The names of the dimensions (default is None). + Zarr format 3 only. Zarr format 2 arrays should not use this parameter. + If not specified, defaults to the dimension names of the data array. + storage_options : dict, optional + If using an fsspec URL to create the store, these will be passed to the backend implementation. + Ignored otherwise. 
+ overwrite : bool, default False + Whether to overwrite an array with the same name in the store, if one exists. + config : ArrayConfig or ArrayConfigLike, optional + Runtime configuration for the array. + + Returns + ------- + AsyncArray + The array. + + Examples + -------- + #TODO + """ + + # fill missing arguments with metadata of data Array + if chunks == "auto": + chunks = data.chunks + if filters is None: + filters = data.filters + if compressors is None: + compressors = data.compressors + if serializer is None: + serializer = data.serializer + if fill_value is None: + fill_value = data.fill_value + if order is None: + order = data.order + if zarr_format is None: + zarr_format = data.metadata.zarr_format + if chunk_key_encoding is None and zarr_format == data.metadata.zarr_format: + if data.metadata.zarr_format == 2: + chunk_key_encoding = {"name": "v2", "separator": data.metadata.dimension_separator} + else: + chunk_key_encoding = data.metadata.chunk_key_encoding + if dimension_names is None and data.metadata.zarr_format == 3: + dimension_names = data.metadata.dimension_names + + new_array = await create_array( + store, + name=name, + shape=data.shape, + dtype=data.dtype, + chunks=chunks, + shards=shards, + filters=filters, + compressors=compressors, + serializer=(serializer or "auto"), + fill_value=fill_value, + order=order, + zarr_format=zarr_format, + attributes=attributes, + chunk_key_encoding=chunk_key_encoding, + dimension_names=dimension_names, + storage_options=storage_options, + overwrite=overwrite, + config=config, + ) + + async def _copy_region(chunk_coords: ChunkCoords | slice, _data: Array) -> None: + arr = await _data._async_array.getitem(chunk_coords) + await new_array.setitem(chunk_coords, arr) + + if new_array.chunks == data.chunks: + # Stream data from the source array to the new array + await concurrent_map( + [(region, data) for region in data._iter_chunk_regions()], + _copy_region, + zarr.core.config.config.get("async.concurrency"), + ) 
+ else: + await _copy_region(slice(None), data) + return new_array + + async def create_array( store: str | StoreLike, *, diff --git a/tests/test_array.py b/tests/test_array.py index a7ebd226c3..fd2c01709e 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1265,42 +1265,72 @@ async def test_scalar_array() -> None: assert arr.shape == () -@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"]) +@pytest.mark.parametrize("store", ["local"], indirect=True) +@pytest.mark.parametrize("store2", ["local"], indirect=["store2"]) +@pytest.mark.parametrize("src_format", [2, 3]) +@pytest.mark.parametrize("new_format", [2, 3]) +async def test_creation_from_other_zarr_format( + store: Store, + store2: Store, + src_format: ZarrFormat, + new_format: ZarrFormat, +) -> None: + src = zarr.create( + (50, 50), + chunks=(10, 10), + store=store, + zarr_format=src_format, + ) + src[:] = np.arange(50 * 50).reshape((50, 50)) + result = zarr.from_array( + src, + store=store2, + zarr_format=new_format, + ) + np.testing.assert_array_equal(result[:], src[:]) + assert result.fill_value == src.fill_value + assert result.dtype == src.dtype + assert result.chunks == src.chunks + + +@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=True) @pytest.mark.parametrize("store2", ["local", "memory", "zip"], indirect=["store2"]) -@pytest.mark.parametrize("src_chunks", [(1, 2), (5, 5), (5, 10)]) -@pytest.mark.parametrize("new_chunks", [(1, 2), (5, 5), (5, 10)]) -async def test_creation_from_other_zarr( - store: Store, store2: Store, src_chunks: tuple[int, int], new_chunks: tuple[int, int] +@pytest.mark.parametrize("src_chunks", [(10, 2), (50, 10)]) +@pytest.mark.parametrize("new_chunks", [(10, 2), (50, 10)]) +async def test_from_array( + store: Store, + store2: Store, + src_chunks: tuple[int, int], + new_chunks: tuple[int, int], + zarr_format: ZarrFormat, ) -> None: src_fill_value = 2 src_dtype = np.dtype("uint8") src_attributes = None src = 
zarr.create( - (10, 10), + (100, 10), chunks=src_chunks, dtype=src_dtype, store=store, fill_value=src_fill_value, attributes=src_attributes, ) - src[:] = np.arange(100).reshape((10, 10)) + src[:] = np.arange(1000).reshape((100, 10)) new_fill_value = 3 - new_dtype = np.dtype("uint16") new_attributes = {"foo": "bar"} - result2 = zarr.array( + result = zarr.array( src, store=store2, chunks=new_chunks, - dtype=new_dtype, fill_value=new_fill_value, attributes=new_attributes, ) - np.testing.assert_array_equal(result2[:], src[:]) - assert result2.fill_value == new_fill_value - assert result2.dtype == new_dtype - assert result2.attrs == new_attributes - assert result2.chunks == new_chunks + np.testing.assert_array_equal(result[:], src[:]) + assert result.fill_value == new_fill_value + assert result.dtype == src_dtype + assert result.attrs == new_attributes + assert result.chunks == new_chunks From f6ae2f8d33053d9f2b5a7e453735f0bc64fd06f1 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 8 Jan 2025 18:13:23 +0100 Subject: [PATCH 11/31] fix mypy --- src/zarr/api/asynchronous.py | 2 +- src/zarr/core/array.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 32f4de965b..2a7635f392 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -554,7 +554,7 @@ async def array( # setup dtype kw_dtype = kwargs.get("dtype") - if kw_dtype is None: + if kw_dtype is None and hasattr(data, "dtype"): kwargs["dtype"] = data.dtype else: kwargs["dtype"] = kw_dtype diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 29bb57d362..733126da87 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3850,7 +3850,7 @@ async def from_array( filters = data.filters if compressors is None: compressors = data.compressors - if serializer is None: + if serializer is "auto": serializer = data.serializer if fill_value is None: fill_value = data.fill_value From 
36146e5aa242b23a3267dc76c814ba22738afe88 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 8 Jan 2025 18:20:53 +0100 Subject: [PATCH 12/31] fix mypy --- src/zarr/core/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 733126da87..245c64470e 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3850,7 +3850,7 @@ async def from_array( filters = data.filters if compressors is None: compressors = data.compressors - if serializer is "auto": + if serializer is "auto" and data.serializer is not None: serializer = data.serializer if fill_value is None: fill_value = data.fill_value From 085efe91082689c5661f4430fe4f5eea23d12569 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Thu, 9 Jan 2025 15:36:17 +0100 Subject: [PATCH 13/31] format --- src/zarr/core/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 245c64470e..8f4017836c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3850,8 +3850,8 @@ async def from_array( filters = data.filters if compressors is None: compressors = data.compressors - if serializer is "auto" and data.serializer is not None: - serializer = data.serializer + if serializer == "auto": + serializer = cast(SerializerLike, data.serializer) if fill_value is None: fill_value = data.fill_value if order is None: From 93ed8d6f4d50fdd625d6c235235d8845c3cb13cf Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Thu, 9 Jan 2025 17:01:13 +0100 Subject: [PATCH 14/31] fix test_creation_from_other_zarr_format --- src/zarr/core/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 8f4017836c..ba4bfce613 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3850,14 +3850,14 @@ async def from_array( filters = data.filters if compressors is None: compressors = data.compressors - if serializer 
== "auto": - serializer = cast(SerializerLike, data.serializer) if fill_value is None: fill_value = data.fill_value if order is None: order = data.order if zarr_format is None: zarr_format = data.metadata.zarr_format + if zarr_format == 3 and serializer == "auto": + serializer = cast(SerializerLike, data.serializer) if chunk_key_encoding is None and zarr_format == data.metadata.zarr_format: if data.metadata.zarr_format == 2: chunk_key_encoding = {"name": "v2", "separator": data.metadata.dimension_separator} From e60710573b60a1dd1cd9bd0b29c252a781529ed3 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 11:40:02 +0100 Subject: [PATCH 15/31] distinguish between keep and auto for from_array arguments --- src/zarr/api/synchronous.py | 24 +++++++++++------ src/zarr/core/array.py | 54 ++++++++++++++++++++++++------------- 2 files changed, 51 insertions(+), 27 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 7acd225c3e..7422d9406e 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -899,11 +899,11 @@ def from_array( store: str | StoreLike, *, name: str | None = None, - chunks: ChunkCoords | Literal["auto"] = "auto", + chunks: Literal["auto", "keep"] | ChunkCoords = "keep", shards: ShardsLike | None = None, - filters: FiltersLike = "auto", - compressors: CompressorsLike = "auto", - serializer: SerializerLike = "auto", + filters: FiltersLike | Literal["keep"] = "keep", + compressors: CompressorsLike | Literal["keep"] = "keep", + serializer: SerializerLike | Literal["keep"] = "keep", fill_value: Any | None = None, order: MemoryOrder | None = None, zarr_format: ZarrFormat | None = 3, @@ -925,12 +925,14 @@ def from_array( name : str or None, optional The name of the array within the store. If ``name`` is ``None``, the array will be located at the root of the store. - chunks : ChunkCoords, optional + chunks : ChunkCoords or "auto" or "keep", optional Chunk shape of the array. 
If not specified, defaults to the chunk shape of the data array. + - "auto": Automatically determine the chunk shape based on the array's shape and dtype. + - "keep": Retain the chunk shape of the input array. shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. - filters : Iterable[Codec], optional + filters : Iterable[Codec] or "auto" or "keep", optional Iterable of filters to apply to each chunk of the array, in order, before serializing that chunk to bytes. @@ -942,7 +944,9 @@ def from_array( the order if your filters is consistent with the behavior of each filter. If no ``filters`` are provided, defaults to the filters of the data array. - compressors : Iterable[Codec], optional + - "auto": Automatically determine the filters based on the array's dtype. + - "keep": Retain the filters of the input array. + compressors : Iterable[Codec] or "auto" or "keep", optional List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. @@ -953,11 +957,15 @@ def from_array( be provided for Zarr format 2. If no ``compressors`` are provided, defaults to the compressors of the data array. - serializer : dict[str, JSON] | ArrayBytesCodec, optional + - "auto": Automatically determine the compressors based on the array's dtype. + - "keep": Retain the compressors of the input array. + serializer : dict[str, JSON] | ArrayBytesCodec or "auto" or "keep", optional Array-to-bytes codec to use for encoding the array data. Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. If no ``serializer`` is provided, defaults to the serializer of the input array. + - "auto": Automatically determine the serializer based on the array's dtype. + - "keep": Retain the serializer of the input array. fill_value : Any, optional Fill value for the array. 
If not specified, defaults to the fill value of the data array. diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ba4bfce613..f6f2a43633 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3740,11 +3740,11 @@ async def from_array( store: str | StoreLike, *, name: str | None = None, - chunks: ChunkCoords | Literal["auto"] = "auto", + chunks: Literal["auto", "keep"] | ChunkCoords = "keep", shards: ShardsLike | None = None, - filters: FiltersLike = "auto", - compressors: CompressorsLike = "auto", - serializer: SerializerLike = "auto", + filters: FiltersLike | Literal["keep"] = "keep", + compressors: CompressorsLike | Literal["keep"] = "keep", + serializer: SerializerLike | Literal["keep"] = "keep", fill_value: Any | None = None, order: MemoryOrder | None = None, zarr_format: ZarrFormat | None = 3, @@ -3766,12 +3766,14 @@ async def from_array( name : str or None, optional The name of the array within the store. If ``name`` is ``None``, the array will be located at the root of the store. - chunks : ChunkCoords, optional + chunks : ChunkCoords or "auto" or "keep", optional Chunk shape of the array. If not specified, defaults to the chunk shape of the data array. + - "auto": Automatically determine the chunk shape based on the array's shape and dtype. + - "keep": Retain the chunk shape of the input array. shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. - filters : Iterable[Codec], optional + filters : Iterable[Codec] or "auto" or "keep", optional Iterable of filters to apply to each chunk of the array, in order, before serializing that chunk to bytes. @@ -3783,7 +3785,9 @@ async def from_array( the order if your filters is consistent with the behavior of each filter. If no ``filters`` are provided, defaults to the filters of the data array. - compressors : Iterable[Codec], optional + - "auto": Automatically determine the filters based on the array's dtype. 
+ - "keep": Retain the filters of the input array. + compressors : Iterable[Codec] or "auto" or "keep", optional List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. @@ -3794,11 +3798,15 @@ async def from_array( be provided for Zarr format 2. If no ``compressors`` are provided, defaults to the compressors of the data array. - serializer : dict[str, JSON] | ArrayBytesCodec, optional + - "auto": Automatically determine the compressors based on the array's dtype. + - "keep": Retain the compressors of the input array. + serializer : dict[str, JSON] | ArrayBytesCodec or "auto" or "keep", optional Array-to-bytes codec to use for encoding the array data. Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. If no ``serializer`` is provided, defaults to the serializer of the input array. + - "auto": Automatically determine the serializer based on the array's dtype. + - "keep": Retain the serializer of the input array. fill_value : Any, optional Fill value for the array. If not specified, defaults to the fill value of the data array. 
@@ -3843,23 +3851,31 @@ async def from_array( #TODO """ - # fill missing arguments with metadata of data Array - if chunks == "auto": + if chunks == "keep": chunks = data.chunks - if filters is None: - filters = data.filters - if compressors is None: - compressors = data.compressors + if zarr_format is None: + zarr_format = data.metadata.zarr_format + if filters == "keep": + if zarr_format == data.metadata.zarr_format: + filters = data.filters + else: + filters = "auto" + if compressors == "keep": + if zarr_format == data.metadata.zarr_format: + compressors = data.compressors + else: + compressors = "auto" + if serializer == "keep": + if zarr_format == 3: + serializer = cast(SerializerLike, data.serializer) + else: + serializer = "auto" if fill_value is None: fill_value = data.fill_value if order is None: order = data.order - if zarr_format is None: - zarr_format = data.metadata.zarr_format - if zarr_format == 3 and serializer == "auto": - serializer = cast(SerializerLike, data.serializer) if chunk_key_encoding is None and zarr_format == data.metadata.zarr_format: - if data.metadata.zarr_format == 2: + if zarr_format == 2: chunk_key_encoding = {"name": "v2", "separator": data.metadata.dimension_separator} else: chunk_key_encoding = data.metadata.chunk_key_encoding From 7eb6988904e36421c04935a4b8ac8327c16684bc Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 13:28:50 +0100 Subject: [PATCH 16/31] partition concurrency along new_array chunks --- src/zarr/core/array.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index f6f2a43633..ee2aa6281d 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3907,15 +3907,12 @@ async def _copy_region(chunk_coords: ChunkCoords | slice, _data: Array) -> None: arr = await _data._async_array.getitem(chunk_coords) await new_array.setitem(chunk_coords, arr) - if new_array.chunks == data.chunks: - # Stream data from the 
source array to the new array - await concurrent_map( - [(region, data) for region in data._iter_chunk_regions()], - _copy_region, - zarr.core.config.config.get("async.concurrency"), - ) - else: - await _copy_region(slice(None), data) + # Stream data from the source array to the new array + await concurrent_map( + [(region, data) for region in new_array._iter_chunk_regions()], + _copy_region, + zarr.core.config.config.get("async.concurrency"), + ) return new_array From c7393a4130544f2e037d5be39c514e7c9ea1edbb Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 13:34:37 +0100 Subject: [PATCH 17/31] fix mypy --- src/zarr/core/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ee2aa6281d..1613f1a4e0 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3875,9 +3875,9 @@ async def from_array( if order is None: order = data.order if chunk_key_encoding is None and zarr_format == data.metadata.zarr_format: - if zarr_format == 2: + if isinstance(data.metadata, ArrayV2Metadata): chunk_key_encoding = {"name": "v2", "separator": data.metadata.dimension_separator} - else: + elif isinstance(data.metadata, ArrayV3Metadata): chunk_key_encoding = data.metadata.chunk_key_encoding if dimension_names is None and data.metadata.zarr_format == 3: dimension_names = data.metadata.dimension_names From 543099a2f21b16956d22a241973c1fe4f40091eb Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 13:55:01 +0100 Subject: [PATCH 18/31] improve test_creation_from_other_zarr_format --- src/zarr/core/array.py | 4 ++-- tests/test_array.py | 16 ++++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 1613f1a4e0..60565169ab 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3857,12 +3857,12 @@ async def from_array( zarr_format = data.metadata.zarr_format if filters == "keep": if 
zarr_format == data.metadata.zarr_format: - filters = data.filters + filters = data.filters or None else: filters = "auto" if compressors == "keep": if zarr_format == data.metadata.zarr_format: - compressors = data.compressors + compressors = data.compressors or None else: compressors = "auto" if serializer == "keep": diff --git a/tests/test_array.py b/tests/test_array.py index fd2c01709e..6455f2a53c 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1275,12 +1275,14 @@ async def test_creation_from_other_zarr_format( src_format: ZarrFormat, new_format: ZarrFormat, ) -> None: - src = zarr.create( - (50, 50), - chunks=(10, 10), - store=store, - zarr_format=src_format, - ) + kwargs = {} + # set dimension_separator to non default + if src_format == 2: + kwargs["dimension_separator"] = "/" + else: + kwargs["chunk_key_encoding"] = ("default", ".") + + src = zarr.create((50, 50), chunks=(10, 10), store=store, zarr_format=src_format, **kwargs) src[:] = np.arange(50 * 50).reshape((50, 50)) result = zarr.from_array( src, @@ -1291,6 +1293,8 @@ async def test_creation_from_other_zarr_format( assert result.fill_value == src.fill_value assert result.dtype == src.dtype assert result.chunks == src.chunks + if src_format == new_format: + assert result.metadata == src.metadata @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=True) From 73843dc974d0fb6bf0245ec412ba9fee3827b54c Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 14:01:20 +0100 Subject: [PATCH 19/31] add typing in test --- tests/test_array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_array.py b/tests/test_array.py index 6455f2a53c..b4f0d1fd40 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1275,7 +1275,7 @@ async def test_creation_from_other_zarr_format( src_format: ZarrFormat, new_format: ZarrFormat, ) -> None: - kwargs = {} + kwargs: dict[str, tuple[Literal["default"], Literal[".", "/"]] | Literal[".", "/"]] = {} # set 
dimension_separator to non default if src_format == 2: kwargs["dimension_separator"] = "/" From 0f0f81224fe283db4ffbdd28d8468843f2bd576e Mon Sep 17 00:00:00 2001 From: Hannes Spitz <44113112+brokkoli71@users.noreply.github.com> Date: Wed, 15 Jan 2025 14:09:46 +0100 Subject: [PATCH 20/31] Update src/zarr/core/array.py Co-authored-by: Norman Rzepka --- src/zarr/core/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 60565169ab..d4d88ce528 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3736,7 +3736,7 @@ class ShardsConfigParam(TypedDict): async def from_array( - data: Array, + data: Array | npt.ArrayLike, store: str | StoreLike, *, name: str | None = None, From 021ca95ffdbde197d34c5a6d439f6564cc9272c7 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 14:42:01 +0100 Subject: [PATCH 21/31] add from_array with npt.ArrayLike --- src/zarr/api/synchronous.py | 2 +- src/zarr/core/array.py | 97 +++++++++++++++++++++++-------------- tests/test_array.py | 10 ++++ 3 files changed, 71 insertions(+), 38 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 7422d9406e..77911b3e4e 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -895,7 +895,7 @@ def create_array( def from_array( - data: Array, + data: Array | npt.ArrayLike, store: str | StoreLike, *, name: str | None = None, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index d4d88ce528..b065eda9d1 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3851,37 +3851,48 @@ async def from_array( #TODO """ - if chunks == "keep": - chunks = data.chunks - if zarr_format is None: - zarr_format = data.metadata.zarr_format - if filters == "keep": - if zarr_format == data.metadata.zarr_format: - filters = data.filters or None - else: + if isinstance(data, Array): + if chunks == "keep": + chunks = data.chunks + if zarr_format is None: + 
zarr_format = data.metadata.zarr_format + if filters == "keep": + if zarr_format == data.metadata.zarr_format: + filters = data.filters or None + else: + filters = "auto" + if compressors == "keep": + if zarr_format == data.metadata.zarr_format: + compressors = data.compressors or None + else: + compressors = "auto" + if serializer == "keep": + if zarr_format == 3: + serializer = cast(SerializerLike, data.serializer) + else: + serializer = "auto" + if fill_value is None: + fill_value = data.fill_value + if order is None: + order = data.order + if chunk_key_encoding is None and zarr_format == data.metadata.zarr_format: + if isinstance(data.metadata, ArrayV2Metadata): + chunk_key_encoding = {"name": "v2", "separator": data.metadata.dimension_separator} + elif isinstance(data.metadata, ArrayV3Metadata): + chunk_key_encoding = data.metadata.chunk_key_encoding + if dimension_names is None and data.metadata.zarr_format == 3: + dimension_names = data.metadata.dimension_names + else: + if chunks == "keep": + chunks = "auto" + if zarr_format is None: + zarr_format = 3 + if filters == "keep": filters = "auto" - if compressors == "keep": - if zarr_format == data.metadata.zarr_format: - compressors = data.compressors or None - else: + if compressors == "keep": compressors = "auto" - if serializer == "keep": - if zarr_format == 3: - serializer = cast(SerializerLike, data.serializer) - else: + if serializer == "keep": serializer = "auto" - if fill_value is None: - fill_value = data.fill_value - if order is None: - order = data.order - if chunk_key_encoding is None and zarr_format == data.metadata.zarr_format: - if isinstance(data.metadata, ArrayV2Metadata): - chunk_key_encoding = {"name": "v2", "separator": data.metadata.dimension_separator} - elif isinstance(data.metadata, ArrayV3Metadata): - chunk_key_encoding = data.metadata.chunk_key_encoding - if dimension_names is None and data.metadata.zarr_format == 3: - dimension_names = data.metadata.dimension_names - new_array = await 
create_array( store, name=name, @@ -3902,17 +3913,29 @@ async def from_array( overwrite=overwrite, config=config, ) + if isinstance(data, Array): - async def _copy_region(chunk_coords: ChunkCoords | slice, _data: Array) -> None: - arr = await _data._async_array.getitem(chunk_coords) - await new_array.setitem(chunk_coords, arr) + async def _copy_region(chunk_coords: ChunkCoords | slice, _data: Array) -> None: + arr = await _data._async_array.getitem(chunk_coords) + await new_array.setitem(chunk_coords, arr) - # Stream data from the source array to the new array - await concurrent_map( - [(region, data) for region in new_array._iter_chunk_regions()], - _copy_region, - zarr.core.config.config.get("async.concurrency"), - ) + # Stream data from the source array to the new array + await concurrent_map( + [(region, data) for region in new_array._iter_chunk_regions()], + _copy_region, + zarr.core.config.config.get("async.concurrency"), + ) + else: + + async def _copy_region(chunk_coords: ChunkCoords | slice, _data: npt.ArrayLike) -> None: + await new_array.setitem(chunk_coords, _data[chunk_coords]) + + # Stream data from the source array to the new array + await concurrent_map( + [(region, data) for region in new_array._iter_chunk_regions()], + _copy_region, + zarr.core.config.config.get("async.concurrency"), + ) return new_array diff --git a/tests/test_array.py b/tests/test_array.py index b4f0d1fd40..cc09a82968 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1338,3 +1338,13 @@ async def test_from_array( assert result.dtype == src_dtype assert result.attrs == new_attributes assert result.chunks == new_chunks + + +@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=True) +@pytest.mark.parametrize("chunks", [(10, 2, 3), "keep", "auto"]) +async def test_from_numpy_array( + store: Store, chunks: Literal["auto", "keep"] | tuple[int, int] +) -> None: + src = np.arange(1000).reshape(10, 10, 10) + result = zarr.from_array(src, store=store, 
chunks=chunks) + np.testing.assert_array_equal(result[:], src) From 092a1e0fc2a6350f11e1447caeb67698ff34f171 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 15:14:11 +0100 Subject: [PATCH 22/31] add write_data argument --- src/zarr/api/synchronous.py | 6 +++++ src/zarr/core/array.py | 47 ++++++++++++++++++++++--------------- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 77911b3e4e..580130c3e0 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -897,6 +897,7 @@ def create_array( def from_array( data: Array | npt.ArrayLike, store: str | StoreLike, + write_data: bool = True, *, name: str | None = None, chunks: Literal["auto", "keep"] | ChunkCoords = "keep", @@ -922,6 +923,10 @@ def from_array( The array to copy. store : str or Store Store or path to directory in file system or name of zip file for the new array. + write_data : bool, default True + Whether to copy the data from the input array to the new array. + If ``write_data`` is ``False``, the new array will be created with the same metadata as the + input array, but without any data. name : str or None, optional The name of the array within the store. If ``name`` is ``None``, the array will be located at the root of the store. @@ -1014,6 +1019,7 @@ def from_array( zarr.core.array.from_array( data, store, + write_data, name=name, chunks=chunks, shards=shards, diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index b065eda9d1..ca5913c7a3 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3738,6 +3738,7 @@ class ShardsConfigParam(TypedDict): async def from_array( data: Array | npt.ArrayLike, store: str | StoreLike, + write_data: bool = True, *, name: str | None = None, chunks: Literal["auto", "keep"] | ChunkCoords = "keep", @@ -3766,6 +3767,10 @@ async def from_array( name : str or None, optional The name of the array within the store. 
If ``name`` is ``None``, the array will be located at the root of the store. + write_data : bool, default True + Whether to copy the data from the input array to the new array. + If ``write_data`` is ``False``, the new array will be created with the same metadata as the + input array, but without any data. chunks : ChunkCoords or "auto" or "keep", optional Chunk shape of the array. If not specified, defaults to the chunk shape of the data array. @@ -3893,6 +3898,9 @@ async def from_array( compressors = "auto" if serializer == "keep": serializer = "auto" + if not hasattr(data, "dtype") or not hasattr(data, "shape"): + data = np.array(data) + print(data.shape) new_array = await create_array( store, name=name, @@ -3913,29 +3921,30 @@ async def from_array( overwrite=overwrite, config=config, ) - if isinstance(data, Array): + if write_data: + if isinstance(data, Array): - async def _copy_region(chunk_coords: ChunkCoords | slice, _data: Array) -> None: - arr = await _data._async_array.getitem(chunk_coords) - await new_array.setitem(chunk_coords, arr) + async def _copy_region(chunk_coords: ChunkCoords | slice, _data: Array) -> None: + arr = await _data._async_array.getitem(chunk_coords) + await new_array.setitem(chunk_coords, arr) - # Stream data from the source array to the new array - await concurrent_map( - [(region, data) for region in new_array._iter_chunk_regions()], - _copy_region, - zarr.core.config.config.get("async.concurrency"), - ) - else: + # Stream data from the source array to the new array + await concurrent_map( + [(region, data) for region in new_array._iter_chunk_regions()], + _copy_region, + zarr.core.config.config.get("async.concurrency"), + ) + else: - async def _copy_region(chunk_coords: ChunkCoords | slice, _data: npt.ArrayLike) -> None: - await new_array.setitem(chunk_coords, _data[chunk_coords]) + async def _copy_region(chunk_coords: ChunkCoords | slice, _data: npt.ArrayLike) -> None: + await new_array.setitem(chunk_coords, _data[chunk_coords]) - 
# Stream data from the source array to the new array - await concurrent_map( - [(region, data) for region in new_array._iter_chunk_regions()], - _copy_region, - zarr.core.config.config.get("async.concurrency"), - ) + # Stream data from the source array to the new array + await concurrent_map( + [(region, data) for region in new_array._iter_chunk_regions()], + _copy_region, + zarr.core.config.config.get("async.concurrency"), + ) return new_array From 58f05feb7486e98a2a71569336c439067d1644f9 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 15:14:21 +0100 Subject: [PATCH 23/31] improve tests --- tests/test_array.py | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/tests/test_array.py b/tests/test_array.py index cc09a82968..822974ddbe 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -8,6 +8,7 @@ import numcodecs import numpy as np +import numpy.typing as npt import pytest import zarr.api.asynchronous @@ -1299,8 +1300,8 @@ async def test_creation_from_other_zarr_format( @pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=True) @pytest.mark.parametrize("store2", ["local", "memory", "zip"], indirect=["store2"]) -@pytest.mark.parametrize("src_chunks", [(10, 2), (50, 10)]) -@pytest.mark.parametrize("new_chunks", [(10, 2), (50, 10)]) +@pytest.mark.parametrize("src_chunks", [(40, 10), (11, 50)]) +@pytest.mark.parametrize("new_chunks", [(40, 10), (11, 50)]) async def test_from_array( store: Store, store2: Store, @@ -1340,11 +1341,30 @@ async def test_from_array( assert result.chunks == new_chunks -@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=True) -@pytest.mark.parametrize("chunks", [(10, 2, 3), "keep", "auto"]) -async def test_from_numpy_array( - store: Store, chunks: Literal["auto", "keep"] | tuple[int, int] +@pytest.mark.parametrize("store", ["local"], indirect=True) +@pytest.mark.parametrize("chunks", ["keep", "auto"]) 
+@pytest.mark.parametrize("write_data", [True, False]) +@pytest.mark.parametrize( + "src", + [ + np.arange(1000).reshape(10, 10, 10), + zarr.ones((10, 10, 10)), + 5, + [1, 2, 3], + [[1, 2, 3], [4, 5, 6]], + ], +) # add other npt.ArrayLike? +async def test_from_array_arraylike( + store: Store, + chunks: Literal["auto", "keep"] | tuple[int, int], + write_data: bool, + src: Array | npt.ArrayLike, ) -> None: - src = np.arange(1000).reshape(10, 10, 10) - result = zarr.from_array(src, store=store, chunks=chunks) - np.testing.assert_array_equal(result[:], src) + fill_value = 42 + result = zarr.from_array( + src, store=store, chunks=chunks, write_data=write_data, fill_value=fill_value + ) + if write_data: + np.testing.assert_array_equal(result[...], src) + else: + np.testing.assert_array_equal(result[...], np.full_like(src, fill_value)) From a4b445683029ed48855bb0a6648643247af185a0 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 16:00:18 +0100 Subject: [PATCH 24/31] improve docstrings and add examples --- src/zarr/api/synchronous.py | 78 +++++++++++++++++++++++++++------ src/zarr/core/array.py | 86 ++++++++++++++++++++++++++++--------- 2 files changed, 130 insertions(+), 34 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 580130c3e0..f38a1921f7 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -859,7 +859,7 @@ def create_array( Examples -------- >>> import zarr - >>> store = zarr.storage.MemoryStore(mode='w') + >>> store = zarr.storage.MemoryStore() >>> arr = await zarr.create_array( >>> store=store, >>> shape=(100,100), @@ -919,7 +919,7 @@ def from_array( Parameters ---------- - data : Array + data : Array | array-like The array to copy. store : str or Store Store or path to directory in file system or name of zip file for the new array. @@ -932,9 +932,11 @@ def from_array( at the root of the store. chunks : ChunkCoords or "auto" or "keep", optional Chunk shape of the array. 
- If not specified, defaults to the chunk shape of the data array. - - "auto": Automatically determine the chunk shape based on the array's shape and dtype. - - "keep": Retain the chunk shape of the input array. + Following values are supported: + - "auto": Automatically determine the chunk shape based on the array's shape and dtype. + - "keep": Retain the chunk shape of the data array if it is a zarr Array. + - ChunkCoords: A tuple of integers representing the chunk shape. + If not specified, defaults to "keep" if data is a zarr Array, otherwise "auto". shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. filters : Iterable[Codec] or "auto" or "keep", optional @@ -948,9 +950,11 @@ def from_array( For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that the the order if your filters is consistent with the behavior of each filter. - If no ``filters`` are provided, defaults to the filters of the data array. + Following values are supported: + - Iterable[Codec]: List of filters to apply to the array. - "auto": Automatically determine the filters based on the array's dtype. - - "keep": Retain the filters of the input array. + - "keep": Retain the filters of the data array if it is a zarr Array. + If no ``filters`` are provided, defaults to "keep" if data is a zarr Array, otherwise "auto". compressors : Iterable[Codec] or "auto" or "keep", optional List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. @@ -961,16 +965,21 @@ def from_array( For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may be provided for Zarr format 2. - If no ``compressors`` are provided, defaults to the compressors of the data array. + Following values are supported: + - Iterable[Codec]: List of compressors to apply to the array. 
- "auto": Automatically determine the compressors based on the array's dtype. - - "keep": Retain the compressors of the input array. + - "keep": Retain the compressors of the input array if it is a zarr Array. + If no ``compressors`` are provided, defaults to "keep" if data is a zarr Array, otherwise "auto". serializer : dict[str, JSON] | ArrayBytesCodec or "auto" or "keep", optional Array-to-bytes codec to use for encoding the array data. Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. - If no ``serializer`` is provided, defaults to the serializer of the input array. - - "auto": Automatically determine the serializer based on the array's dtype. - - "keep": Retain the serializer of the input array. + Following values are supported: + - dict[str, JSON]: A dict representation of an ``ArrayBytesCodec``. + - ArrayBytesCodec: An instance of ``ArrayBytesCodec``. + - "auto": a default serializer will be used. These defaults can be changed by modifying the value of + ``array.v3_default_serializer`` in :mod:`zarr.core.config`. + - "keep": Retain the serializer of the input array if it is a zarr Array. fill_value : Any, optional Fill value for the array. If not specified, defaults to the fill value of the data array. @@ -1007,12 +1016,53 @@ def from_array( Returns ------- - AsyncArray + Array The array. 
Examples -------- - #TODO + Create an array from an existing Array: + >>> import zarr + >>> store = zarr.storage.MemoryStore() + >>> store2 = zarr.storage.LocalStore('example.zarr') + >>> arr = zarr.create_array( + >>> store=store, + >>> shape=(100,100), + >>> chunks=(10,10), + >>> dtype='int32', + >>> fill_value=0) + >>> arr2 = zarr.from_array(arr, store=store2) + + + Create an array from an existing NumPy array: + >>> import numpy as np + >>> arr3 = zarr.from_array( + >>> np.arange(10000, dtype='i4').reshape(100, 100), + >>> store=zarr.storage.MemoryStore(), + >>> ) + + + Create an array from any array-like object: + >>> arr4 = zarr.from_array( + >>> [[1, 2], [3, 4]], + >>> store= zarr.storage.MemoryStore(), + >>> ) + + >>> arr4[...] + [[1 2] + [3 4]] + + Create an array from an existing Array without copying the data: + >>> arr5 = zarr.from_array( + >>> arr4, + >>> store=zarr.storage.MemoryStore(), + >>> write_data=False, + >>> ) + + >>> arr5[...] + [[0 0] + [0 0]] + """ return Array( sync( diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index ca5913c7a3..167f4f7b2c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -837,7 +837,7 @@ async def open( Examples -------- >>> import zarr - >>> store = zarr.storage.MemoryStore(mode='w') + >>> store = zarr.storage.MemoryStore() >>> async_arr = await AsyncArray.open(store) # doctest: +ELLIPSIS """ @@ -1269,7 +1269,7 @@ async def getitem( Examples -------- >>> import zarr - >>> store = zarr.storage.MemoryStore(mode='w') + >>> store = zarr.storage.MemoryStore() >>> async_arr = await zarr.api.asynchronous.create_array( ... store=store, ... shape=(100,100), @@ -3760,22 +3760,24 @@ async def from_array( Parameters ---------- - data : Array + data : Array | array-like The array to copy. store : str or Store Store or path to directory in file system or name of zip file for the new array. - name : str or None, optional - The name of the array within the store. 
If ``name`` is ``None``, the array will be located - at the root of the store. write_data : bool, default True Whether to copy the data from the input array to the new array. If ``write_data`` is ``False``, the new array will be created with the same metadata as the input array, but without any data. + name : str or None, optional + The name of the array within the store. If ``name`` is ``None``, the array will be located + at the root of the store. chunks : ChunkCoords or "auto" or "keep", optional Chunk shape of the array. - If not specified, defaults to the chunk shape of the data array. - - "auto": Automatically determine the chunk shape based on the array's shape and dtype. - - "keep": Retain the chunk shape of the input array. + Following values are supported: + - "auto": Automatically determine the chunk shape based on the array's shape and dtype. + - "keep": Retain the chunk shape of the data array if it is a zarr Array. + - ChunkCoords: A tuple of integers representing the chunk shape. + If not specified, defaults to "keep" if data is a zarr Array, otherwise "auto". shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. filters : Iterable[Codec] or "auto" or "keep", optional @@ -3789,9 +3791,11 @@ async def from_array( For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that the the order if your filters is consistent with the behavior of each filter. - If no ``filters`` are provided, defaults to the filters of the data array. + Following values are supported: + - Iterable[Codec]: List of filters to apply to the array. - "auto": Automatically determine the filters based on the array's dtype. - - "keep": Retain the filters of the input array. + - "keep": Retain the filters of the data array if it is a zarr Array. + If no ``filters`` are provided, defaults to "keep" if data is a zarr Array, otherwise "auto". 
compressors : Iterable[Codec] or "auto" or "keep", optional List of compressors to apply to the array. Compressors are applied in order, and after any filters are applied (if any are specified) and the data is serialized into bytes. @@ -3802,16 +3806,21 @@ async def from_array( For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor may be provided for Zarr format 2. - If no ``compressors`` are provided, defaults to the compressors of the data array. + Following values are supported: + - Iterable[Codec]: List of compressors to apply to the array. - "auto": Automatically determine the compressors based on the array's dtype. - - "keep": Retain the compressors of the input array. + - "keep": Retain the compressors of the input array if it is a zarr Array. + If no ``compressors`` are provided, defaults to "keep" if data is a zarr Array, otherwise "auto". serializer : dict[str, JSON] | ArrayBytesCodec or "auto" or "keep", optional Array-to-bytes codec to use for encoding the array data. Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. - If no ``serializer`` is provided, defaults to the serializer of the input array. - - "auto": Automatically determine the serializer based on the array's dtype. - - "keep": Retain the serializer of the input array. + Following values are supported: + - dict[str, JSON]: A dict representation of an ``ArrayBytesCodec``. + - ArrayBytesCodec: An instance of ``ArrayBytesCodec``. + - "auto": a default serializer will be used. These defaults can be changed by modifying the value of + ``array.v3_default_serializer`` in :mod:`zarr.core.config`. + - "keep": Retain the serializer of the input array if it is a zarr Array. fill_value : Any, optional Fill value for the array. If not specified, defaults to the fill value of the data array. 
@@ -3853,9 +3862,47 @@ async def from_array( Examples -------- - #TODO + Create an array from an existing Array: + >>> import zarr + >>> store = zarr.storage.MemoryStore() + >>> store2 = zarr.storage.LocalStore('example.zarr') + >>> arr = zarr.create_array( + >>> store=store, + >>> shape=(100,100), + >>> chunks=(10,10), + >>> dtype='int32', + >>> fill_value=0) + >>> arr2 = await zarr.api.asynchronous.from_array(arr, store=store2) + + + Create an array from an existing NumPy array: + >>> arr3 = await zarr.api.asynchronous.from_array( + >>> np.arange(10000, dtype='i4').reshape(100, 100), + >>> store=zarr.storage.MemoryStore(), + >>> ) + + + Create an array from any array-like object: + >>> arr4 = await zarr.api.asynchronous.from_array( + >>> [[1, 2], [3, 4]], + >>> store=zarr.storage.MemoryStore(), + >>> ) + + >>> await arr4.getitem(...) + array([[1, 2], + [3, 4]]) + + Create an array from an existing Array without copying the data: + >>> arr5 = await zarr.api.asynchronous.from_array( + >>> Array(arr4), + >>> store=zarr.storage.MemoryStore(), + >>> write_data=False, + >>> ) + + >>> await arr5.getitem(...) 
+ array([[0, 0], + [0, 0]]) """ - if isinstance(data, Array): if chunks == "keep": chunks = data.chunks @@ -3900,7 +3947,6 @@ async def from_array( serializer = "auto" if not hasattr(data, "dtype") or not hasattr(data, "shape"): data = np.array(data) - print(data.shape) new_array = await create_array( store, name=name, @@ -4064,7 +4110,7 @@ async def create_array( Examples -------- >>> import zarr - >>> store = zarr.storage.MemoryStore(mode='w') + >>> store = zarr.storage.MemoryStore() >>> async_arr = await zarr.api.asynchronous.create_array( >>> store=store, >>> shape=(100,100), From fc69b6707a5818a02c54e29851cf5e3f84cb148a Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 16:28:15 +0100 Subject: [PATCH 25/31] fix mypy and readthedocs --- src/zarr/api/synchronous.py | 14 ++++++++++---- src/zarr/core/array.py | 10 ++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index f38a1921f7..332aa11405 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -933,9 +933,11 @@ def from_array( chunks : ChunkCoords or "auto" or "keep", optional Chunk shape of the array. Following values are supported: - - "auto": Automatically determine the chunk shape based on the array's shape and dtype. - - "keep": Retain the chunk shape of the data array if it is a zarr Array. - - ChunkCoords: A tuple of integers representing the chunk shape. + + - "auto": Automatically determine the chunk shape based on the array's shape and dtype. + - "keep": Retain the chunk shape of the data array if it is a zarr Array. + - ChunkCoords: A tuple of integers representing the chunk shape. + If not specified, defaults to "keep" if data is a zarr Array, otherwise "auto". shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. 
@@ -951,9 +953,11 @@ def from_array( the order if your filters is consistent with the behavior of each filter. Following values are supported: + - Iterable[Codec]: List of filters to apply to the array. - "auto": Automatically determine the filters based on the array's dtype. - "keep": Retain the filters of the data array if it is a zarr Array. + If no ``filters`` are provided, defaults to "keep" if data is a zarr Array, otherwise "auto". compressors : Iterable[Codec] or "auto" or "keep", optional List of compressors to apply to the array. Compressors are applied in order, and after any @@ -966,15 +970,18 @@ def from_array( be provided for Zarr format 2. Following values are supported: + - Iterable[Codec]: List of compressors to apply to the array. - "auto": Automatically determine the compressors based on the array's dtype. - "keep": Retain the compressors of the input array if it is a zarr Array. + If no ``compressors`` are provided, defaults to "keep" if data is a zarr Array, otherwise "auto". serializer : dict[str, JSON] | ArrayBytesCodec or "auto" or "keep", optional Array-to-bytes codec to use for encoding the array data. Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. Following values are supported: + - dict[str, JSON]: A dict representation of an ``ArrayBytesCodec``. - ArrayBytesCodec: An instance of ``ArrayBytesCodec``. - "auto": a default serializer will be used. These defaults can be changed by modifying the value of @@ -1062,7 +1069,6 @@ def from_array( >>> arr5[...] 
[[0 0] [0 0]] - """ return Array( sync( diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 167f4f7b2c..8bfb0ef239 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3970,25 +3970,27 @@ async def from_array( if write_data: if isinstance(data, Array): - async def _copy_region(chunk_coords: ChunkCoords | slice, _data: Array) -> None: + async def _copy_array_region(chunk_coords: ChunkCoords | slice, _data: Array) -> None: arr = await _data._async_array.getitem(chunk_coords) await new_array.setitem(chunk_coords, arr) # Stream data from the source array to the new array await concurrent_map( [(region, data) for region in new_array._iter_chunk_regions()], - _copy_region, + _copy_array_region, zarr.core.config.config.get("async.concurrency"), ) else: - async def _copy_region(chunk_coords: ChunkCoords | slice, _data: npt.ArrayLike) -> None: + async def _copy_arraylike_region( + chunk_coords: ChunkCoords | slice, _data: npt.ArrayLike + ) -> None: await new_array.setitem(chunk_coords, _data[chunk_coords]) # Stream data from the source array to the new array await concurrent_map( [(region, data) for region in new_array._iter_chunk_regions()], - _copy_region, + _copy_arraylike_region, zarr.core.config.config.get("async.concurrency"), ) return new_array From 4f3e1560325d9df2ffd5ce6717d0a8813e9d35a0 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 16:43:47 +0100 Subject: [PATCH 26/31] fix mypy and readthedocs --- src/zarr/api/synchronous.py | 2 +- src/zarr/core/array.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 332aa11405..e832ac9327 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -915,7 +915,7 @@ def from_array( overwrite: bool = False, config: ArrayConfig | ArrayConfigLike | None = None, ) -> Array: - """Create an array from an existing array. 
+ """Create an array from an existing array or array-like. Parameters ---------- diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 8bfb0ef239..569d0ca59e 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3756,7 +3756,7 @@ async def from_array( overwrite: bool = False, config: ArrayConfig | ArrayConfigLike | None = None, ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: - """Create an array from an existing array. + """Create an array from an existing array or array-like. Parameters ---------- @@ -3774,9 +3774,11 @@ async def from_array( chunks : ChunkCoords or "auto" or "keep", optional Chunk shape of the array. Following values are supported: - - "auto": Automatically determine the chunk shape based on the array's shape and dtype. - - "keep": Retain the chunk shape of the data array if it is a zarr Array. - - ChunkCoords: A tuple of integers representing the chunk shape. + + - "auto": Automatically determine the chunk shape based on the array's shape and dtype. + - "keep": Retain the chunk shape of the data array if it is a zarr Array. + - ChunkCoords: A tuple of integers representing the chunk shape. + If not specified, defaults to "keep" if data is a zarr Array, otherwise "auto". shards : ChunkCoords, optional Shard shape of the array. The default value of ``None`` results in no sharding at all. @@ -3792,9 +3794,11 @@ async def from_array( the order if your filters is consistent with the behavior of each filter. Following values are supported: + - Iterable[Codec]: List of filters to apply to the array. - "auto": Automatically determine the filters based on the array's dtype. - "keep": Retain the filters of the data array if it is a zarr Array. + If no ``filters`` are provided, defaults to "keep" if data is a zarr Array, otherwise "auto". compressors : Iterable[Codec] or "auto" or "keep", optional List of compressors to apply to the array. 
Compressors are applied in order, and after any @@ -3807,20 +3811,24 @@ async def from_array( be provided for Zarr format 2. Following values are supported: + - Iterable[Codec]: List of compressors to apply to the array. - "auto": Automatically determine the compressors based on the array's dtype. - "keep": Retain the compressors of the input array if it is a zarr Array. + If no ``compressors`` are provided, defaults to "keep" if data is a zarr Array, otherwise "auto". serializer : dict[str, JSON] | ArrayBytesCodec or "auto" or "keep", optional Array-to-bytes codec to use for encoding the array data. Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion. Following values are supported: + - dict[str, JSON]: A dict representation of an ``ArrayBytesCodec``. - ArrayBytesCodec: An instance of ``ArrayBytesCodec``. - "auto": a default serializer will be used. These defaults can be changed by modifying the value of ``array.v3_default_serializer`` in :mod:`zarr.core.config`. - "keep": Retain the serializer of the input array if it is a zarr Array. + fill_value : Any, optional Fill value for the array. If not specified, defaults to the fill value of the data array. 
@@ -3983,7 +3991,7 @@ async def _copy_array_region(chunk_coords: ChunkCoords | slice, _data: Array) -> else: async def _copy_arraylike_region( - chunk_coords: ChunkCoords | slice, _data: npt.ArrayLike + chunk_coords: ChunkCoords | slice, _data: NDArrayLike ) -> None: await new_array.setitem(chunk_coords, _data[chunk_coords]) From a2373ad96b5abbfdae991dfa81e3007431e233c2 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 19:28:46 +0100 Subject: [PATCH 27/31] fix mypy and readthedocs --- src/zarr/api/synchronous.py | 2 +- src/zarr/core/array.py | 6 ++---- tests/test_array.py | 15 ++++++++++----- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index e832ac9327..934e22a38a 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -985,7 +985,7 @@ def from_array( - dict[str, JSON]: A dict representation of an ``ArrayBytesCodec``. - ArrayBytesCodec: An instance of ``ArrayBytesCodec``. - "auto": a default serializer will be used. These defaults can be changed by modifying the value of - ``array.v3_default_serializer`` in :mod:`zarr.core.config`. + `array.v3_default_serializer` in :mod:`zarr.core.config`. - "keep": Retain the serializer of the input array if it is a zarr Array. fill_value : Any, optional Fill value for the array. diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index d74ec4870e..a008556f64 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3834,7 +3834,7 @@ async def from_array( - dict[str, JSON]: A dict representation of an ``ArrayBytesCodec``. - ArrayBytesCodec: An instance of ``ArrayBytesCodec``. - "auto": a default serializer will be used. These defaults can be changed by modifying the value of - ``array.v3_default_serializer`` in :mod:`zarr.core.config`. + `array.v3_default_serializer`` in :mod:`zarr.core.config`. - "keep": Retain the serializer of the input array if it is a zarr Array. 
fill_value : Any, optional @@ -3998,9 +3998,7 @@ async def _copy_array_region(chunk_coords: ChunkCoords | slice, _data: Array) -> ) else: - async def _copy_arraylike_region( - chunk_coords: ChunkCoords | slice, _data: NDArrayLike - ) -> None: + async def _copy_arraylike_region(chunk_coords: slice, _data: NDArrayLike) -> None: await new_array.setitem(chunk_coords, _data[chunk_coords]) # Stream data from the source array to the new array diff --git a/tests/test_array.py b/tests/test_array.py index 822974ddbe..c52e14d4cc 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1276,14 +1276,19 @@ async def test_creation_from_other_zarr_format( src_format: ZarrFormat, new_format: ZarrFormat, ) -> None: - kwargs: dict[str, tuple[Literal["default"], Literal[".", "/"]] | Literal[".", "/"]] = {} - # set dimension_separator to non default if src_format == 2: - kwargs["dimension_separator"] = "/" + src = zarr.create( + (50, 50), chunks=(10, 10), store=store, zarr_format=src_format, dimension_separator="/" + ) else: - kwargs["chunk_key_encoding"] = ("default", ".") + src = zarr.create( + (50, 50), + chunks=(10, 10), + store=store, + zarr_format=src_format, + chunk_key_encoding=("default", "."), + ) - src = zarr.create((50, 50), chunks=(10, 10), store=store, zarr_format=src_format, **kwargs) src[:] = np.arange(50 * 50).reshape((50, 50)) result = zarr.from_array( src, From 2706f179935efff8df9d0a9e3e453d5584dba1b8 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 19:59:55 +0100 Subject: [PATCH 28/31] fix mypy and readthedocs --- src/zarr/api/synchronous.py | 3 ++- tests/test_array.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 934e22a38a..126f1b3743 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -987,6 +987,7 @@ def from_array( - "auto": a default serializer will be used. 
These defaults can be changed by modifying the value of `array.v3_default_serializer` in :mod:`zarr.core.config`. - "keep": Retain the serializer of the input array if it is a zarr Array. + fill_value : Any, optional Fill value for the array. If not specified, defaults to the fill value of the data array. @@ -1068,7 +1069,7 @@ def from_array( >>> arr5[...] [[0 0] - [0 0]] + [0 0]] """ return Array( sync( diff --git a/tests/test_array.py b/tests/test_array.py index c52e14d4cc..692f39ea89 100644 --- a/tests/test_array.py +++ b/tests/test_array.py @@ -1370,6 +1370,6 @@ async def test_from_array_arraylike( src, store=store, chunks=chunks, write_data=write_data, fill_value=fill_value ) if write_data: - np.testing.assert_array_equal(result[...], src) + np.testing.assert_array_equal(result[...], np.array(src)) else: np.testing.assert_array_equal(result[...], np.full_like(src, fill_value)) From d5ccda112b1b18aeae0397069d01193e7c35046f Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 20:07:44 +0100 Subject: [PATCH 29/31] fix readthedocs ERROR: Unexpected indentation --- src/zarr/api/synchronous.py | 6 ++---- src/zarr/core/array.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 126f1b3743..8b725e4ac0 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -1057,8 +1057,7 @@ def from_array( >>> ) >>> arr4[...] - [[1 2] - [3 4]] + array([[1, 2],[3, 4]]) Create an array from an existing Array without copying the data: >>> arr5 = zarr.from_array( @@ -1068,8 +1067,7 @@ def from_array( >>> ) >>> arr5[...] - [[0 0] - [0 0]] + array([[0, 0],[0, 0]]) """ return Array( sync( diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index a008556f64..19c6130a99 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3905,8 +3905,7 @@ async def from_array( >>> ) >>> await arr4.getitem(...) 
- array([[1, 2], - [3, 4]]) + array([[1, 2],[3, 4]]) Create an array from an existing Array without copying the data: >>> arr5 = await zarr.api.asynchronous.from_array( @@ -3916,8 +3915,7 @@ async def from_array( >>> ) >>> await arr5.getitem(...) - array([[0, 0], - [0, 0]]) + array([[0, 0],[0, 0]]) """ if isinstance(data, Array): if chunks == "keep": From e3d691d0038aea4e1fccc25b5e96cf0953f8e8fc Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 20:41:26 +0100 Subject: [PATCH 30/31] add release notes --- docs/release-notes.rst | 2 ++ src/zarr/api/synchronous.py | 2 +- src/zarr/core/array.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/release-notes.rst b/docs/release-notes.rst index ecd413510b..5e169dab98 100644 --- a/docs/release-notes.rst +++ b/docs/release-notes.rst @@ -7,6 +7,8 @@ Unreleased New features ~~~~~~~~~~~~ +* Implement ``zarr.from_array`` using concurrent streaming (:issue:`2622`). + Bug fixes ~~~~~~~~~ * Fixes ``order`` argument for Zarr format 2 arrays (:issue:`2679`). diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 8b725e4ac0..fe43e7094f 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -985,7 +985,7 @@ def from_array( - dict[str, JSON]: A dict representation of an ``ArrayBytesCodec``. - ArrayBytesCodec: An instance of ``ArrayBytesCodec``. - "auto": a default serializer will be used. These defaults can be changed by modifying the value of - `array.v3_default_serializer` in :mod:`zarr.core.config`. + ``array.v3_default_serializer`` in :mod:`zarr.core.config`. - "keep": Retain the serializer of the input array if it is a zarr Array. fill_value : Any, optional diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 19c6130a99..a5ec89117c 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3834,7 +3834,7 @@ async def from_array( - dict[str, JSON]: A dict representation of an ``ArrayBytesCodec``. 
- ArrayBytesCodec: An instance of ``ArrayBytesCodec``. - "auto": a default serializer will be used. These defaults can be changed by modifying the value of - `array.v3_default_serializer`` in :mod:`zarr.core.config`. + ``array.v3_default_serializer`` in :mod:`zarr.core.config`. - "keep": Retain the serializer of the input array if it is a zarr Array. fill_value : Any, optional From 29074fdb2774e449fc242df3268ef6ab2414e917 Mon Sep 17 00:00:00 2001 From: brokkoli71 Date: Wed, 15 Jan 2025 20:52:04 +0100 Subject: [PATCH 31/31] format docstring examples --- src/zarr/api/synchronous.py | 82 +++++++++++++++++++------------------ src/zarr/core/array.py | 80 +++++++++++++++++++----------------- 2 files changed, 85 insertions(+), 77 deletions(-) diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index fe43e7094f..6ad0124ad8 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -1029,45 +1029,49 @@ def from_array( Examples -------- - Create an array from an existing Array: - >>> import zarr - >>> store = zarr.storage.MemoryStore() - >>> store2 = zarr.storage.LocalStore('example.zarr') - >>> arr = zarr.create_array( - >>> store=store, - >>> shape=(100,100), - >>> chunks=(10,10), - >>> dtype='int32', - >>> fill_value=0) - >>> arr2 = zarr.from_array(arr, store=store2) - - - Create an array from an existing NumPy array: - >>> import numpy as np - >>> arr3 = zarr.from_array( - >>> np.arange(10000, dtype='i4').reshape(100, 100), - >>> store=zarr.storage.MemoryStore(), - >>> ) - - - Create an array from any array-like object: - >>> arr4 = zarr.from_array( - >>> [[1, 2], [3, 4]], - >>> store= zarr.storage.MemoryStore(), - >>> ) - - >>> arr4[...] - array([[1, 2],[3, 4]]) - - Create an array from an existing Array without copying the data: - >>> arr5 = zarr.from_array( - >>> arr4, - >>> store=zarr.storage.MemoryStore(), - >>> write_data=False, - >>> ) - - >>> arr5[...] 
- array([[0, 0],[0, 0]]) + Create an array from an existing Array:: + + >>> import zarr + >>> store = zarr.storage.MemoryStore() + >>> store2 = zarr.storage.LocalStore('example.zarr') + >>> arr = zarr.create_array( + >>> store=store, + >>> shape=(100,100), + >>> chunks=(10,10), + >>> dtype='int32', + >>> fill_value=0) + >>> arr2 = zarr.from_array(arr, store=store2) + + + Create an array from an existing NumPy array:: + + >>> import numpy as np + >>> arr3 = zarr.from_array( + >>> np.arange(10000, dtype='i4').reshape(100, 100), + >>> store=zarr.storage.MemoryStore(), + >>> ) + + + Create an array from any array-like object:: + + >>> arr4 = zarr.from_array( + >>> [[1, 2], [3, 4]], + >>> store= zarr.storage.MemoryStore(), + >>> ) + + >>> arr4[...] + array([[1, 2],[3, 4]]) + + Create an array from an existing Array without copying the data:: + + >>> arr5 = zarr.from_array( + >>> arr4, + >>> store=zarr.storage.MemoryStore(), + >>> write_data=False, + >>> ) + + >>> arr5[...] + array([[0, 0],[0, 0]]) """ return Array( sync( diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index a5ec89117c..eb483a58e5 100644 --- a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -3878,44 +3878,48 @@ async def from_array( Examples -------- - Create an array from an existing Array: - >>> import zarr - >>> store = zarr.storage.MemoryStore() - >>> store2 = zarr.storage.LocalStore('example.zarr') - >>> arr = zarr.create_array( - >>> store=store, - >>> shape=(100,100), - >>> chunks=(10,10), - >>> dtype='int32', - >>> fill_value=0) - >>> arr2 = await zarr.api.asynchronous.from_array(arr, store=store2) - - - Create an array from an existing NumPy array: - >>> arr3 = await zarr.api.asynchronous.from_array( - >>> np.arange(10000, dtype='i4').reshape(100, 100), - >>> store=zarr.storage.MemoryStore(), - >>> ) - - - Create an array from any array-like object: - >>> arr4 = await zarr.api.asynchronous.from_array( - >>> [[1, 2], [3, 4]], - >>> store=zarr.storage.MemoryStore(), - >>> ) - - 
>>> await arr4.getitem(...) - array([[1, 2],[3, 4]]) - - Create an array from an existing Array without copying the data: - >>> arr5 = await zarr.api.asynchronous.from_array( - >>> Array(arr4), - >>> store=zarr.storage.MemoryStore(), - >>> write_data=False, - >>> ) - - >>> await arr5.getitem(...) - array([[0, 0],[0, 0]]) + Create an array from an existing Array:: + + >>> import zarr + >>> store = zarr.storage.MemoryStore() + >>> store2 = zarr.storage.LocalStore('example.zarr') + >>> arr = zarr.create_array( + >>> store=store, + >>> shape=(100,100), + >>> chunks=(10,10), + >>> dtype='int32', + >>> fill_value=0) + >>> arr2 = await zarr.api.asynchronous.from_array(arr, store=store2) + + + Create an array from an existing NumPy array:: + + >>> arr3 = await zarr.api.asynchronous.from_array( + >>> np.arange(10000, dtype='i4').reshape(100, 100), + >>> store=zarr.storage.MemoryStore(), + >>> ) + + + Create an array from any array-like object:: + + >>> arr4 = await zarr.api.asynchronous.from_array( + >>> [[1, 2], [3, 4]], + >>> store=zarr.storage.MemoryStore(), + >>> ) + + >>> await arr4.getitem(...) + array([[1, 2],[3, 4]]) + + Create an array from an existing Array without copying the data:: + + >>> arr5 = await zarr.api.asynchronous.from_array( + >>> Array(arr4), + >>> store=zarr.storage.MemoryStore(), + >>> write_data=False, + >>> ) + + >>> await arr5.getitem(...) + array([[0, 0],[0, 0]]) """ if isinstance(data, Array): if chunks == "keep":