Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Repack Nwb Files #1003

Draft
wants to merge 40 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
7304229
setup temp conversion script
pauladkisson Aug 12, 2024
4cc2a06
added from_existing_neurodata_object for hdf5
pauladkisson Aug 12, 2024
c33dfbf
added get_existing_dataset_io_configurations
pauladkisson Aug 12, 2024
80c1fba
added support for chunk_shape=None
pauladkisson Aug 13, 2024
7ee6fc6
added from_existing_nwbfile to HDF5BackendConfiguration
pauladkisson Aug 13, 2024
dacdeea
added get_existing_backend_configuration
pauladkisson Aug 13, 2024
dae04bf
added repack_nwbfile
pauladkisson Aug 13, 2024
4ac6e33
fixed bug with export options and hdmf.container.Container.set_data_io
pauladkisson Aug 14, 2024
ce267fb
refactored from_ methods
pauladkisson Aug 14, 2024
49f4262
template and changes optional
pauladkisson Aug 14, 2024
d93a5c5
added image series test
pauladkisson Aug 15, 2024
ab8b22f
Merge branch 'main' into repack
bendichter Aug 15, 2024
934bb3a
Merge branch 'main' into repack
pauladkisson Aug 15, 2024
1ad69ca
added initial test
pauladkisson Aug 15, 2024
04fb89c
updated signature to use file_path
pauladkisson Aug 16, 2024
6dab477
added test for trials table (fails)
pauladkisson Aug 16, 2024
e6d31a6
moved backend_configuration_changes to top of the fn
pauladkisson Aug 16, 2024
7252449
consolidated configure_and_export_nwbfile into configure_and_write_nw…
pauladkisson Aug 16, 2024
2ef5c44
parameterized for use_default_backend_configuration
pauladkisson Aug 16, 2024
80eb598
optional dci
pauladkisson Aug 19, 2024
433f8c9
added test for backend config changes
pauladkisson Aug 19, 2024
dd906ac
updated api to use boolean use_default flag instead of mode=existing
pauladkisson Aug 19, 2024
668cacc
added test for get_existing_backend_configuration
pauladkisson Aug 19, 2024
7796197
removed image_series test
pauladkisson Aug 19, 2024
b8a788c
added compressed trials table column
pauladkisson Aug 19, 2024
f631fb4
added test for get_existing_dataset_io.py
pauladkisson Aug 20, 2024
b089eb3
Merge branch 'main' into repack
pauladkisson Aug 20, 2024
c464764
added docstrings
pauladkisson Aug 20, 2024
1cf3629
used BACKEND_NWB_IO dict
pauladkisson Aug 20, 2024
481529f
added ZarrDatsetIOConfiguration.from_neurodata_object
pauladkisson Aug 20, 2024
1e6b119
Merge branch 'main' into repack
bendichter Aug 20, 2024
9f02b61
removed unnecessary indent
pauladkisson Aug 21, 2024
9ee146f
estimate buffer shape
pauladkisson Aug 21, 2024
ee7ec52
updated temp_test
pauladkisson Aug 21, 2024
a2145a1
added zarr to dataset_io tests
pauladkisson Aug 22, 2024
5785af0
added zarr to backend_configuration tests
pauladkisson Aug 22, 2024
b07c002
added zarr to repack_nwbfile tests
pauladkisson Aug 22, 2024
a8c57e8
Merge branch 'main' into repack
pauladkisson Jan 25, 2025
ad641ef
fixed merge
pauladkisson Jan 25, 2025
8d8689f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/neuroconv/tools/nwb_helpers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@

from ._backend_configuration import (
BACKEND_CONFIGURATIONS,
BACKEND_NWB_IO,
get_default_backend_configuration,
get_existing_backend_configuration,
)
from ._configuration_models import DATASET_IO_CONFIGURATIONS
from ._configuration_models._base_backend import BackendConfiguration
Expand All @@ -21,15 +23,15 @@
ZarrDatasetIOConfiguration,
)
from ._configure_backend import configure_backend
from ._dataset_configuration import get_default_dataset_io_configurations
from ._dataset_configuration import get_default_dataset_io_configurations, get_existing_dataset_io_configurations
from ._metadata_and_file_helpers import (
BACKEND_NWB_IO,
add_device_from_metadata,
configure_and_write_nwbfile,
get_default_nwbfile_metadata,
get_module,
make_nwbfile_from_metadata,
make_or_load_nwbfile,
repack_nwbfile,
)

__all__ = [
Expand All @@ -46,6 +48,8 @@
"ZarrDatasetIOConfiguration",
"get_default_backend_configuration",
"get_default_dataset_io_configurations",
"get_existing_backend_configuration",
"get_existing_dataset_io_configurations",
"configure_backend",
"get_default_dataset_io_configurations",
"get_default_backend_configuration",
Expand All @@ -55,4 +59,5 @@
"get_module",
"make_nwbfile_from_metadata",
"make_or_load_nwbfile",
"repack_nwbfile",
]
26 changes: 25 additions & 1 deletion src/neuroconv/tools/nwb_helpers/_backend_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

from typing import Literal, Union

from pynwb import NWBFile
from hdmf_zarr import NWBZarrIO
from pynwb import NWBHDF5IO, NWBFile

from ._configuration_models._hdf5_backend import HDF5BackendConfiguration
from ._configuration_models._zarr_backend import ZarrBackendConfiguration

BACKEND_CONFIGURATIONS = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration)
BACKEND_NWB_IO = dict(hdf5=NWBHDF5IO, zarr=NWBZarrIO)


def get_default_backend_configuration(
Expand All @@ -17,3 +19,25 @@ def get_default_backend_configuration(

BackendConfigurationClass = BACKEND_CONFIGURATIONS[backend]
return BackendConfigurationClass.from_nwbfile(nwbfile=nwbfile)


def get_existing_backend_configuration(nwbfile: NWBFile) -> Union[HDF5BackendConfiguration, ZarrBackendConfiguration]:
    """
    Build a backend configuration that mirrors how an already-written NWB file is stored.

    The result is intended as a starting point for further customization.

    Parameters
    ----------
    nwbfile : NWBFile
        The NWBFile object to extract the backend configuration from. The nwbfile must have been read from an io
        object, because the backend is detected from the type of `nwbfile.read_io`.

    Returns
    -------
    Union[HDF5BackendConfiguration, ZarrBackendConfiguration]
        The backend configuration extracted from the nwbfile.

    Raises
    ------
    ValueError
        If `nwbfile.read_io` is not an instance of any IO class registered in `BACKEND_NWB_IO`
        (for example, when the nwbfile was created in memory and `read_io` is None).
    """
    read_io = nwbfile.read_io

    # Pick the first registered backend whose IO class matches the object the file was read with.
    backend = next(
        (name for name, io_class in BACKEND_NWB_IO.items() if isinstance(read_io, io_class)),
        None,
    )
    if backend is None:
        # Without this guard the lookup would silently fall through to an arbitrary backend.
        supported_io_classes = ", ".join(io_class.__name__ for io_class in BACKEND_NWB_IO.values())
        raise ValueError(
            "Unable to determine the backend of the NWBFile: `nwbfile.read_io` is of type "
            f"{type(read_io).__name__}, but expected one of ({supported_io_classes}). "
            "The NWBFile must have been read from an existing file."
        )

    BackendConfigurationClass = BACKEND_CONFIGURATIONS[backend]
    return BackendConfigurationClass.from_nwbfile(nwbfile=nwbfile, use_default_dataset_io_configurations=False)
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@

from ._base_dataset_io import DatasetIOConfiguration
from ._pydantic_pure_json_schema_generator import PureJSONSchemaGenerator
from .._dataset_configuration import get_default_dataset_io_configurations
from .._dataset_configuration import (
get_default_dataset_io_configurations,
get_existing_dataset_io_configurations,
)


class BackendConfiguration(BaseModel):
Expand Down Expand Up @@ -56,11 +59,31 @@ def model_json_schema(cls, **kwargs) -> dict[str, Any]:
return super().model_json_schema(mode="validation", schema_generator=PureJSONSchemaGenerator, **kwargs)

@classmethod
def from_nwbfile(cls, nwbfile: NWBFile, use_default_dataset_io_configurations: bool = True) -> Self:
    """
    Create a backend configuration from an NWBFile.

    Parameters
    ----------
    nwbfile : pynwb.NWBFile
        The NWBFile object to extract the backend configuration from.
    use_default_dataset_io_configurations : bool, optional
        Whether to use default dataset configurations, by default True. If False, the existing dataset
        configurations in the NWBFile will be used, which requires that the NWBFile was read from an io object.

    Returns
    -------
    Self
        The backend configuration extracted from the NWBFile.
    """
    if use_default_dataset_io_configurations:
        dataset_io_configurations = get_default_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend)
    else:
        dataset_io_configurations = get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend)

    # Key each configuration by its location in the file.
    dataset_configurations = {
        dataset_io_configuration.location_in_file: dataset_io_configuration
        for dataset_io_configuration in dataset_io_configurations
    }

    return cls(dataset_configurations=dataset_configurations)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,6 @@ def __str__(self) -> str:
"""
size_in_bytes = math.prod(self.full_shape) * self.dtype.itemsize
maximum_ram_usage_per_iteration_in_bytes = math.prod(self.buffer_shape) * self.dtype.itemsize
disk_space_usage_per_chunk_in_bytes = math.prod(self.chunk_shape) * self.dtype.itemsize

string = (
f"\n{self.location_in_file}"
Expand All @@ -158,10 +157,14 @@ def __str__(self) -> str:
f"\n buffer shape : {self.buffer_shape}"
f"\n expected RAM usage : {human_readable_size(maximum_ram_usage_per_iteration_in_bytes)}"
"\n"
f"\n chunk shape : {self.chunk_shape}"
f"\n disk space usage per chunk : {human_readable_size(disk_space_usage_per_chunk_in_bytes)}"
"\n"
)
if self.chunk_shape is not None:
disk_space_usage_per_chunk_in_bytes = math.prod(self.chunk_shape) * self.dtype.itemsize
string += (
f"\n chunk shape : {self.chunk_shape}"
f"\n disk space usage per chunk : {human_readable_size(disk_space_usage_per_chunk_in_bytes)}"
"\n"
)
if self.compression_method is not None:
string += f"\n compression method : {self.compression_method}"
if self.compression_options is not None:
Expand All @@ -181,9 +184,9 @@ def validate_all_shapes(cls, values: dict[str, Any]) -> dict[str, Any]:
dataset_name == location_in_file.split("/")[-1]
), f"The `dataset_name` ({dataset_name}) does not match the end of the `location_in_file` ({location_in_file})!"

chunk_shape = values["chunk_shape"]
buffer_shape = values["buffer_shape"]
full_shape = values["full_shape"]
chunk_shape = values["chunk_shape"] if values["chunk_shape"] is not None else full_shape
buffer_shape = values["buffer_shape"] if values["buffer_shape"] is not None else full_shape

if len(chunk_shape) != len(buffer_shape):
raise ValueError(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@
from typing import Any, Literal, Union

import h5py
import numpy as np
from hdmf import Container
from pydantic import Field, InstanceOf
from typing_extensions import Self

from ._base_dataset_io import DatasetIOConfiguration
from ._base_dataset_io import DatasetIOConfiguration, _find_location_in_memory_nwbfile
from ...hdmf import SliceableDataChunkIterator
from ...importing import is_package_installed

_base_hdf5_filters = set(h5py.filters.decode)
Expand Down Expand Up @@ -78,3 +82,37 @@ def get_data_io_kwargs(self) -> dict[str, Any]:
compression_bundle = dict(compression=self.compression_method, compression_opts=compression_opts)

return dict(chunks=self.chunk_shape, **compression_bundle)

@classmethod
def from_neurodata_object(
    cls,
    neurodata_object: Container,
    dataset_name: Literal["data", "timestamps"],
    use_default_dataset_io_configuration: bool = True,
) -> Self:
    """
    Construct a dataset I/O configuration for one dataset of a neurodata object.

    Parameters
    ----------
    neurodata_object : hdmf.Container
        The object that owns the dataset.
    dataset_name : "data" or "timestamps"
        Name of the attribute on `neurodata_object` holding the dataset.
    use_default_dataset_io_configuration : bool, default: True
        If True, delegate to the parent class implementation. If False, read the shape, dtype, chunking and
        compression settings directly off the dataset attribute.
        NOTE(review): the non-default path presumably expects an h5py-backed dataset (it reads `.chunks`,
        `.compression` and `.compression_opts`) - confirm against callers.

    Returns
    -------
    Self
        The configuration describing the dataset.
    """
    if use_default_dataset_io_configuration:
        return super().from_neurodata_object(neurodata_object=neurodata_object, dataset_name=dataset_name)

    location_in_file = _find_location_in_memory_nwbfile(neurodata_object=neurodata_object, field_name=dataset_name)

    dataset = getattr(neurodata_object, dataset_name)
    full_shape = dataset.shape
    dtype = dataset.dtype
    chunk_shape = dataset.chunks

    # When the dataset reports no chunking, size the buffer as if the whole dataset were a single chunk.
    buffer_shape = SliceableDataChunkIterator.estimate_default_buffer_shape(
        buffer_gb=0.5,
        chunk_shape=chunk_shape or full_shape,
        maxshape=full_shape,
        dtype=np.dtype(dtype),
    )

    compression_method = dataset.compression
    compression_options = {"compression_opts": dataset.compression_opts}

    return cls(
        object_id=neurodata_object.object_id,
        object_name=neurodata_object.name,
        location_in_file=location_in_file,
        dataset_name=dataset_name,
        full_shape=full_shape,
        dtype=dtype,
        chunk_shape=chunk_shape,
        buffer_shape=buffer_shape,
        compression_method=compression_method,
        compression_options=compression_options,
    )
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@
from typing import Any, Literal, Union

import numcodecs
import numpy as np
import zarr
from hdmf import Container
from pydantic import Field, InstanceOf, model_validator
from typing_extensions import Self

from ._base_dataset_io import DatasetIOConfiguration
from ._base_dataset_io import DatasetIOConfiguration, _find_location_in_memory_nwbfile
from ...hdmf import SliceableDataChunkIterator

_base_zarr_codecs = set(zarr.codec_registry.keys())
_lossy_zarr_codecs = set(("astype", "bitround", "quantize"))
Expand Down Expand Up @@ -130,3 +134,36 @@ def get_data_io_kwargs(self) -> dict[str, Any]:
compressor = False

return dict(chunks=self.chunk_shape, filters=filters, compressor=compressor)

@classmethod
def from_neurodata_object(
    cls,
    neurodata_object: Container,
    dataset_name: Literal["data", "timestamps"],
    use_default_dataset_io_configuration: bool = True,
) -> Self:
    """
    Construct a dataset I/O configuration for one dataset of a neurodata object.

    Parameters
    ----------
    neurodata_object : hdmf.Container
        The object that owns the dataset.
    dataset_name : "data" or "timestamps"
        Name of the attribute on `neurodata_object` holding the dataset.
    use_default_dataset_io_configuration : bool, default: True
        If True, delegate to the parent class implementation. If False, read the shape, dtype, chunking,
        compressor and filters directly off the dataset attribute.
        NOTE(review): the non-default path presumably expects a zarr-backed dataset (it reads `.chunks`,
        `.compressor` and `.filters`) - confirm against callers.

    Returns
    -------
    Self
        The configuration describing the dataset.
    """
    if use_default_dataset_io_configuration:
        return super().from_neurodata_object(neurodata_object=neurodata_object, dataset_name=dataset_name)

    location_in_file = _find_location_in_memory_nwbfile(neurodata_object=neurodata_object, field_name=dataset_name)

    dataset = getattr(neurodata_object, dataset_name)
    full_shape = dataset.shape
    dtype = dataset.dtype
    chunk_shape = dataset.chunks

    # When the dataset reports no chunking, size the buffer as if the whole dataset were a single chunk.
    buffer_shape = SliceableDataChunkIterator.estimate_default_buffer_shape(
        buffer_gb=0.5,
        chunk_shape=chunk_shape or full_shape,
        maxshape=full_shape,
        dtype=np.dtype(dtype),
    )

    compression_method = dataset.compressor
    filter_methods = dataset.filters

    return cls(
        object_id=neurodata_object.object_id,
        object_name=neurodata_object.name,
        location_in_file=location_in_file,
        dataset_name=dataset_name,
        full_shape=full_shape,
        dtype=dtype,
        chunk_shape=chunk_shape,
        buffer_shape=buffer_shape,
        compression_method=compression_method,
        filter_methods=filter_methods,
    )
15 changes: 12 additions & 3 deletions src/neuroconv/tools/nwb_helpers/_configure_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Union

from hdmf.common import Data
from hdmf.data_utils import DataChunkIterator
from pynwb import NWBFile, TimeSeries

from ._configuration_models._hdf5_backend import HDF5BackendConfiguration
Expand Down Expand Up @@ -46,16 +47,24 @@ def configure_backend(

# Table columns
if isinstance(neurodata_object, Data):
neurodata_object.set_data_io(data_io_class=data_io_class, data_io_kwargs=data_io_kwargs)
neurodata_object.set_data_io(
data_io_class=data_io_class, data_io_kwargs=data_io_kwargs, data_chunk_iterator_class=DataChunkIterator
)
# TimeSeries data or timestamps
elif isinstance(neurodata_object, TimeSeries) and not is_dataset_linked:
neurodata_object.set_data_io(
dataset_name=dataset_name, data_io_class=data_io_class, data_io_kwargs=data_io_kwargs
dataset_name=dataset_name,
data_io_class=data_io_class,
data_io_kwargs=data_io_kwargs,
data_chunk_iterator_class=DataChunkIterator,
)
# Special ndx-events v0.2.0 types
elif is_ndx_events_installed and isinstance(neurodata_object, ndx_events.Events):
neurodata_object.set_data_io(
dataset_name=dataset_name, data_io_class=data_io_class, data_io_kwargs=data_io_kwargs
dataset_name=dataset_name,
data_io_class=data_io_class,
data_io_kwargs=data_io_kwargs,
data_chunk_iterator_class=DataChunkIterator,
)
# But temporarily skipping LabeledEvents
elif is_ndx_events_installed and isinstance(neurodata_object, ndx_events.LabeledEvents):
Expand Down
79 changes: 79 additions & 0 deletions src/neuroconv/tools/nwb_helpers/_dataset_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,82 @@ def get_default_dataset_io_configurations(
)

yield dataset_io_configuration


def get_existing_dataset_io_configurations(
    nwbfile: NWBFile,
    backend: Literal["hdf5", "zarr"],
) -> Generator[DatasetIOConfiguration, None, None]:
    """
    Yield a DatasetIOConfiguration for each eligible dataset found in an nwbfile.

    Each configuration is built with `use_default_dataset_io_configuration=False`, i.e. from the dataset as it
    currently exists rather than from defaults.

    Parameters
    ----------
    nwbfile : pynwb.NWBFile
        An NWBFile object that has been read from an existing file with an existing backend configuration.
    backend : "hdf5" or "zarr"
        Selects which entry of `DATASET_IO_CONFIGURATIONS` is used to build each configuration.

    Yields
    ------
    DatasetIOConfiguration
        A configuration object for each dataset in the NWB file that was not skipped.
    """
    DatasetIOConfigurationClass = DATASET_IO_CONFIGURATIONS[backend]
    known_dataset_fields = ("data", "timestamps")

    for neurodata_object in nwbfile.objects.values():
        if isinstance(neurodata_object, DynamicTable):
            for column in neurodata_object.columns:
                column_data = column.data

                # Columns holding links to other containers (e.g. the 'group' of an ElectrodesTable) are skipped
                if any(isinstance(value, Container) for value in column_data):
                    continue

                # Columns of a reference type are skipped
                if isinstance(column, TimeSeriesReferenceVectorData):
                    continue

                # Datasets with any zero-length axis are skipped
                column_shape = get_data_shape(data=column_data)
                if any(axis_length == 0 for axis_length in column_shape):
                    continue

                yield DatasetIOConfigurationClass.from_neurodata_object(
                    neurodata_object=column,
                    dataset_name="data",
                    use_default_dataset_io_configuration=False,
                )
        elif isinstance(neurodata_object, NWBContainer):
            for field_name in known_dataset_fields:
                # Optional fields that are absent are skipped
                if field_name not in neurodata_object.fields:
                    continue

                field_data = getattr(neurodata_object, field_name)

                # Edge case: in-memory ImageSeries with external mode; data is in fields and is an empty array
                if isinstance(field_data, np.ndarray) and field_data.size == 0:
                    continue

                # Datasets with any zero-length axis are skipped
                field_shape = get_data_shape(data=field_data)
                if any(axis_length == 0 for axis_length in field_shape):
                    continue

                yield DatasetIOConfigurationClass.from_neurodata_object(
                    neurodata_object=neurodata_object,
                    dataset_name=field_name,
                    use_default_dataset_io_configuration=False,
                )
Loading
Loading