Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

zarr.array from an existing zarr.Array #2622

Open
wants to merge 37 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
5483956
add creation from other zarr
brokkoli71 Jan 2, 2025
9a32d1f
remove duplicated tests
brokkoli71 Jan 2, 2025
2c19072
improve test
brokkoli71 Jan 2, 2025
91152be
test_iter_grid for non-squares
brokkoli71 Jan 2, 2025
3c5ec3f
concurrent streaming for equal chunk sizes
brokkoli71 Jan 8, 2025
a675475
Merge branch 'main' into creation-from-other-zarr
brokkoli71 Jan 8, 2025
79a45b1
fix merge
brokkoli71 Jan 8, 2025
da2f03f
fix mypy
brokkoli71 Jan 8, 2025
a7553a7
Merge branch 'main' into creation-from-other-zarr
brokkoli71 Jan 8, 2025
7728d7f
fix mypy
brokkoli71 Jan 8, 2025
2df18a0
fix test_iter_grid
brokkoli71 Jan 8, 2025
03e2500
extract to zarr.from_array
brokkoli71 Jan 8, 2025
f6ae2f8
fix mypy
brokkoli71 Jan 8, 2025
36146e5
fix mypy
brokkoli71 Jan 8, 2025
085efe9
format
brokkoli71 Jan 9, 2025
353a477
Merge branch 'main' into creation-from-other-zarr
brokkoli71 Jan 9, 2025
93ed8d6
fix test_creation_from_other_zarr_format
brokkoli71 Jan 9, 2025
e607105
distinguish between keep and auto for from_array arguments
brokkoli71 Jan 15, 2025
7eb6988
partition concurrency along new_array chunks
brokkoli71 Jan 15, 2025
c7393a4
fix mypy
brokkoli71 Jan 15, 2025
543099a
improve test_creation_from_other_zarr_format
brokkoli71 Jan 15, 2025
73843dc
add typing in test
brokkoli71 Jan 15, 2025
0f0f812
Update src/zarr/core/array.py
brokkoli71 Jan 15, 2025
97b7b9b
Merge remote-tracking branch 'origin/creation-from-other-zarr' into c…
brokkoli71 Jan 15, 2025
021ca95
add from_array with npt.ArrayLike
brokkoli71 Jan 15, 2025
092a1e0
add write_data argument
brokkoli71 Jan 15, 2025
58f05fe
improve tests
brokkoli71 Jan 15, 2025
a4b4456
improve docstrings and add examples
brokkoli71 Jan 15, 2025
fc69b67
fix mypy and readthedocs
brokkoli71 Jan 15, 2025
4f3e156
fix mypy and readthedocs
brokkoli71 Jan 15, 2025
addb785
Merge branch 'main' into creation-from-other-zarr
brokkoli71 Jan 15, 2025
a2373ad
fix mypy and readthedocs
brokkoli71 Jan 15, 2025
2706f17
fix mypy and readthedocs
brokkoli71 Jan 15, 2025
d5ccda1
fix readthedocs ERROR: Unexpected indentation
brokkoli71 Jan 15, 2025
e3d691d
add release notes
brokkoli71 Jan 15, 2025
29074fd
format docstring examples
brokkoli71 Jan 15, 2025
4ad7a5a
Merge branch 'main' into creation-from-other-zarr
brokkoli71 Jan 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/release-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ Release notes
3.0.1 (Jan. 17, 2025)
---------------------

* Implement ``zarr.from_array`` using concurrent streaming (:issue:`2622`).

Bug fixes
~~~~~~~~~
* Fixes ``order`` argument for Zarr format 2 arrays (:issue:`2679`).
Expand Down
2 changes: 2 additions & 0 deletions src/zarr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
create_group,
empty,
empty_like,
from_array,
full,
full_like,
group,
Expand Down Expand Up @@ -52,6 +53,7 @@
"create_group",
"empty",
"empty_like",
"from_array",
"full",
"full_like",
"group",
Expand Down
10 changes: 7 additions & 3 deletions src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import numpy.typing as npt
from typing_extensions import deprecated

from zarr.core.array import Array, AsyncArray, create_array, get_array_metadata
from zarr.core.array import Array, AsyncArray, create_array, from_array, get_array_metadata
from zarr.core.array_spec import ArrayConfig, ArrayConfigLike
from zarr.core.buffer import NDArrayLike
from zarr.core.common import (
Expand Down Expand Up @@ -50,6 +50,7 @@
"create_array",
"empty",
"empty_like",
"from_array",
"full",
"full_like",
"group",
Expand Down Expand Up @@ -527,7 +528,7 @@ async def tree(grp: AsyncGroup, expand: bool | None = None, level: int | None =


async def array(
data: npt.ArrayLike, **kwargs: Any
data: npt.ArrayLike | Array, **kwargs: Any
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
"""Create an array filled with `data`.

Expand All @@ -544,13 +545,16 @@ async def array(
The new array.
"""

if isinstance(data, Array):
return await from_array(data, **kwargs)

# ensure data is array-like
if not hasattr(data, "shape") or not hasattr(data, "dtype"):
data = np.asanyarray(data)

# setup dtype
kw_dtype = kwargs.get("dtype")
if kw_dtype is None:
if kw_dtype is None and hasattr(data, "dtype"):
kwargs["dtype"] = data.dtype
else:
kwargs["dtype"] = kw_dtype
Expand Down
210 changes: 208 additions & 2 deletions src/zarr/api/synchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
"create_array",
"empty",
"empty_like",
"from_array",
"full",
"full_like",
"group",
Expand Down Expand Up @@ -357,7 +358,7 @@ def tree(grp: Group, expand: bool | None = None, level: int | None = None) -> An


# TODO: add type annotations for kwargs
def array(data: npt.ArrayLike, **kwargs: Any) -> Array:
def array(data: npt.ArrayLike | Array, **kwargs: Any) -> Array:
"""Create an array filled with `data`.

Parameters
Expand Down Expand Up @@ -864,7 +865,7 @@ def create_array(
Examples
--------
>>> import zarr
>>> store = zarr.storage.MemoryStore(mode='w')
>>> store = zarr.storage.MemoryStore()
>>> arr = await zarr.create_array(
>>> store=store,
>>> shape=(100,100),
Expand Down Expand Up @@ -900,6 +901,211 @@ def create_array(
)


def from_array(
    data: Array | npt.ArrayLike,
    store: str | StoreLike,
    write_data: bool = True,
    *,
    name: str | None = None,
    chunks: Literal["auto", "keep"] | ChunkCoords = "keep",
    shards: ShardsLike | None = None,
    filters: FiltersLike | Literal["keep"] = "keep",
    compressors: CompressorsLike | Literal["keep"] = "keep",
    serializer: SerializerLike | Literal["keep"] = "keep",
    fill_value: Any | None = None,
    order: MemoryOrder | None = None,
    zarr_format: ZarrFormat | None = 3,
    attributes: dict[str, JSON] | None = None,
    chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None,
    dimension_names: Iterable[str] | None = None,
    storage_options: dict[str, Any] | None = None,
    overwrite: bool = False,
    config: ArrayConfig | ArrayConfigLike | None = None,
) -> Array:
    """Create a new array in ``store`` from an existing zarr Array or any array-like object.

    This is the blocking counterpart of :func:`zarr.core.array.from_array`: every argument is
    forwarded unchanged to the asynchronous implementation, the call is run to completion, and
    the result is wrapped in a synchronous :class:`Array`.

    Parameters
    ----------
    data : Array | array-like
        The array to copy.
    store : str or Store
        Store or path to directory in file system or name of zip file for the new array.
    write_data : bool, default True
        Whether to copy the data from the input array to the new array.
        If ``write_data`` is ``False``, the new array will be created with the same metadata as
        the input array, but without any data.
    name : str or None, optional
        The name of the array within the store. If ``name`` is ``None``, the array will be
        located at the root of the store.
    chunks : ChunkCoords or "auto" or "keep", optional
        Chunk shape of the array.
        The following values are supported:

        - "auto": Automatically determine the chunk shape based on the array's shape and dtype.
        - "keep": Retain the chunk shape of the data array if it is a zarr Array.
        - ChunkCoords: A tuple of integers representing the chunk shape.

        If not specified, defaults to "keep" if data is a zarr Array, otherwise "auto".
    shards : ChunkCoords, optional
        Shard shape of the array. The default value of ``None`` results in no sharding at all.
    filters : Iterable[Codec] or "auto" or "keep", optional
        Iterable of filters to apply to each chunk of the array, in order, before serializing
        that chunk to bytes.

        For Zarr format 3, a "filter" is a codec that takes an array and returns an array,
        and these values must be instances of ``ArrayArrayCodec``, or dict representations
        of ``ArrayArrayCodec``.

        For Zarr format 2, a "filter" can be any numcodecs codec; you should ensure that
        the order of your filters is consistent with the behavior of each filter.

        The following values are supported:

        - Iterable[Codec]: List of filters to apply to the array.
        - "auto": Automatically determine the filters based on the array's dtype.
        - "keep": Retain the filters of the data array if it is a zarr Array.

        If no ``filters`` are provided, defaults to "keep" if data is a zarr Array,
        otherwise "auto".
    compressors : Iterable[Codec] or "auto" or "keep", optional
        List of compressors to apply to the array. Compressors are applied in order, and after
        any filters are applied (if any are specified) and the data is serialized into bytes.

        For Zarr format 3, a "compressor" is a codec that takes a bytestream, and
        returns another bytestream. Multiple compressors may be provided for Zarr format 3.

        For Zarr format 2, a "compressor" can be any numcodecs codec. Only a single compressor
        may be provided for Zarr format 2.

        The following values are supported:

        - Iterable[Codec]: List of compressors to apply to the array.
        - "auto": Automatically determine the compressors based on the array's dtype.
        - "keep": Retain the compressors of the input array if it is a zarr Array.

        If no ``compressors`` are provided, defaults to "keep" if data is a zarr Array,
        otherwise "auto".
    serializer : dict[str, JSON] | ArrayBytesCodec or "auto" or "keep", optional
        Array-to-bytes codec to use for encoding the array data.
        Zarr format 3 only. Zarr format 2 arrays use implicit array-to-bytes conversion.

        The following values are supported:

        - dict[str, JSON]: A dict representation of an ``ArrayBytesCodec``.
        - ArrayBytesCodec: An instance of ``ArrayBytesCodec``.
        - "auto": A default serializer will be used. These defaults can be changed by modifying
          the value of ``array.v3_default_serializer`` in :mod:`zarr.core.config`.
        - "keep": Retain the serializer of the input array if it is a zarr Array.

    fill_value : Any, optional
        Fill value for the array.
        If not specified, defaults to the fill value of the data array.
    order : {"C", "F"}, optional
        The memory order of the array.
        For Zarr format 2, this parameter sets the memory order of the array.
        For Zarr format 3, this parameter is deprecated, because memory order
        is a runtime parameter for Zarr format 3 arrays. The recommended way to specify the
        memory order for Zarr format 3 arrays is via the ``config`` parameter,
        e.g. ``{'order': 'C'}``.
        If not specified, defaults to the memory order of the data array.
    zarr_format : {2, 3}, optional
        The zarr format to use when saving. The default is 3.
        NOTE(review): passing ``None`` presumably selects the zarr format of the data array;
        confirm against :func:`zarr.core.array.from_array`.
    attributes : dict, optional
        Attributes for the array.
        If not specified, defaults to the attributes of the data array.
    chunk_key_encoding : ChunkKeyEncoding, optional
        A specification of how the chunk keys are represented in storage.
        For Zarr format 3, the default is ``{"name": "default", "separator": "/"}``.
        For Zarr format 2, the default is ``{"name": "v2", "separator": "."}``.
        If not specified and the data array has the same zarr format as the target array,
        the chunk key encoding of the data array is used.
    dimension_names : Iterable[str], optional
        The names of the dimensions (default is None).
        Zarr format 3 only. Zarr format 2 arrays should not use this parameter.
        If not specified, defaults to the dimension names of the data array.
    storage_options : dict, optional
        If using an fsspec URL to create the store, these will be passed to the backend
        implementation. Ignored otherwise.
    overwrite : bool, default False
        Whether to overwrite an array with the same name in the store, if one exists.
    config : ArrayConfig or ArrayConfigLike, optional
        Runtime configuration for the array.

    Returns
    -------
    Array
        The array.

    Examples
    --------
    Create an array from an existing Array::

        >>> import zarr
        >>> store = zarr.storage.MemoryStore()
        >>> store2 = zarr.storage.LocalStore('example.zarr')
        >>> arr = zarr.create_array(
        ...     store=store,
        ...     shape=(100, 100),
        ...     chunks=(10, 10),
        ...     dtype='int32',
        ...     fill_value=0,
        ... )
        >>> arr2 = zarr.from_array(arr, store=store2)
        >>> arr2
        <Array file://example.zarr shape=(100, 100) dtype=int32>

    Create an array from an existing NumPy array::

        >>> import numpy as np
        >>> arr3 = zarr.from_array(
        ...     np.arange(10000, dtype='i4').reshape(100, 100),
        ...     store=zarr.storage.MemoryStore(),
        ... )
        >>> arr3
        <Array memory://125477403529984 shape=(100, 100) dtype=int32>

    Create an array from any array-like object::

        >>> arr4 = zarr.from_array(
        ...     [[1, 2], [3, 4]],
        ...     store=zarr.storage.MemoryStore(),
        ... )
        >>> arr4
        <Array memory://125477392154368 shape=(2, 2) dtype=int64>
        >>> arr4[...]
        array([[1, 2],
               [3, 4]])

    Create an array from an existing Array without copying the data::

        >>> arr5 = zarr.from_array(
        ...     arr4,
        ...     store=zarr.storage.MemoryStore(),
        ...     write_data=False,
        ... )
        >>> arr5
        <Array memory://140678602965568 shape=(2, 2) dtype=int64>
        >>> arr5[...]
        array([[0, 0],
               [0, 0]])
    """
    # Keyword-only options are handed to the async implementation exactly as received.
    forwarded: dict[str, Any] = {
        "name": name,
        "chunks": chunks,
        "shards": shards,
        "filters": filters,
        "compressors": compressors,
        "serializer": serializer,
        "fill_value": fill_value,
        "order": order,
        "zarr_format": zarr_format,
        "attributes": attributes,
        "chunk_key_encoding": chunk_key_encoding,
        "dimension_names": dimension_names,
        "storage_options": storage_options,
        "overwrite": overwrite,
        "config": config,
    }
    # Build the coroutine, run it to completion, then wrap the async result for sync callers.
    pending = zarr.core.array.from_array(data, store, write_data, **forwarded)
    async_result = sync(pending)
    return Array(async_result)


# TODO: add type annotations for kwargs
def empty(shape: ChunkCoords, **kwargs: Any) -> Array:
"""Create an empty array.
Expand Down
Loading