Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix HDFVirtualBackend handling of non coordinate dimension HDF datasets. #410

Merged
merged 5 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions virtualizarr/readers/hdf/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,10 @@ def _virtual_vars_from_hdf(
group_name = "/"

variables = {}
non_coordinate_dimesion_vars = HDFVirtualBackend._find_non_coord_dimension_vars(
group=g
)
Comment on lines +374 to +376
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: typo (dimesion -> dimension)

Suggested change
non_coordinate_dimesion_vars = HDFVirtualBackend._find_non_coord_dimension_vars(
group=g
)
non_coordinate_dimension_vars = (
HDFVirtualBackend._find_non_coord_dimension_vars(group=g)
)

drop_variables = list(set(drop_variables + non_coordinate_dimesion_vars))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick but I think this is more legible and faster:

Suggested change
drop_variables = list(set(drop_variables + non_coordinate_dimesion_vars))
drop_variables = [*drop_variables, *non_coordinate_dimension_vars]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Speed of this is not important, and I think the first one makes intent clearer (if the intent is to de-duplicate using set?)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point! I was getting too nitpicky. Just the typo then :)

for key in g.keys():
if key not in drop_variables:
if isinstance(g[key], h5py.Dataset):
Expand Down Expand Up @@ -403,3 +407,17 @@ def _get_group_attrs(
g = f
attrs = HDFVirtualBackend._extract_attrs(g)
return attrs

@staticmethod
def _find_non_coord_dimension_vars(group: H5Group) -> List[str]:
    """Find dimension-only variables in *group* that carry no coordinate data.

    netCDF4-on-HDF5 creates a phantom HDF5 dataset for every dimension,
    tagged with the ``_Netcdf4Dimid`` attribute. When the dimension has no
    associated coordinate variable, that phantom dataset holds no data
    (its allocated storage size is 0) and should be dropped rather than
    surfaced as a variable.

    Parameters
    ----------
    group : H5Group
        The h5py group to scan.

    Returns
    -------
    List[str]
        Names of datasets in *group* that represent non-coordinate
        dimensions (dimension markers with zero allocated storage).
    """
    # A single pass suffices: the dimension-marker attribute and the
    # storage size are both properties of the same object, so there is
    # no need to pre-collect dimension names in a separate loop.
    return [
        name
        for name, obj in group.items()
        if isinstance(obj, h5py.Dataset)
        and "_Netcdf4Dimid" in obj.attrs
        # Zero allocated storage => phantom dimension dataset, not a
        # real coordinate variable backed by data.
        and obj.id.get_storage_size() == 0
    ]
12 changes: 2 additions & 10 deletions virtualizarr/tests/test_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,6 @@ def test_var_attr_coords(self, netcdf4_file_with_2d_coords, hdf_backend):
+ expected_2d_coords
+ expected_1d_non_dimension_coords
+ expected_scalar_coords
# These should not be included in coords see #401 for more information
+ (["xi_rho", "eta_rho"] if hdf_backend == HDFVirtualBackend else [])
)
assert set(vds.coords) == set(expected_coords)

Expand Down Expand Up @@ -344,10 +342,7 @@ def test_open_subgroup(
indexes={},
backend=hdf_backend,
)
# This should just be ["bar"] see #401 for more information
assert list(vds.variables) == (
["bar", "dim_0"] if hdf_backend == HDFVirtualBackend else ["bar"]
)
assert list(vds.variables) == ["bar"]
assert isinstance(vds["bar"].data, ManifestArray)
assert vds["bar"].shape == (2,)

Expand All @@ -364,10 +359,7 @@ def test_open_root_group(
indexes={},
backend=hdf_backend,
)
# This should just be ["foo"] see #401 for more information
assert list(vds.variables) == (
["foo", "dim_0"] if hdf_backend == HDFVirtualBackend else ["foo"]
)
assert list(vds.variables) == ["foo"]
assert isinstance(vds["foo"].data, ManifestArray)
assert vds["foo"].shape == (3,)

Expand Down
9 changes: 9 additions & 0 deletions virtualizarr/tests/test_readers/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,3 +333,12 @@ def netcdf3_file(tmp_path: pathlib.Path) -> pathlib.Path:
ds.to_netcdf(filepath, format="NETCDF3_CLASSIC")

return filepath


@pytest.fixture
def non_coord_dim(tmpdir):
    """Write a netCDF4 file whose remaining dims have no coordinate variables.

    Builds the standard test dataset, removes ``dim3``, and persists the
    result with the netcdf4 engine. Returns the path to the written file.
    """
    target = f"{tmpdir}/non_coord_dim.nc"
    dataset = create_test_data(dim_sizes=(20, 80, 10)).drop_dims("dim3")
    dataset.to_netcdf(target, engine="netcdf4")
    return target
13 changes: 10 additions & 3 deletions virtualizarr/tests/test_readers/test_hdf/test_hdf_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,6 @@ def test_filters_h5netcdf_roundtrip(
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True)
xrt.assert_allclose(ds, roundtrip)

@pytest.mark.xfail(
reason="Coordinate issue affecting only hdf reader see pull/#260"
)
def test_filters_netcdf4_roundtrip(
self, tmpdir, filter_encoded_roundtrip_netcdf4_file
):
Expand All @@ -50,3 +47,13 @@ def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file
vds.virtualize.to_kerchunk(kerchunk_file, format="json")
roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
xrt.assert_allclose(ds, roundtrip)

def test_non_coord_dim(self, tmpdir, non_coord_dim):
    """Round-trip a file with non-coordinate dimensions through kerchunk.

    The dataset opened via the virtual (HDF) backend and serialized to
    kerchunk JSON must be equal to the dataset opened directly.
    """
    expected = xr.open_dataset(non_coord_dim)
    vds = virtualizarr.open_virtual_dataset(
        non_coord_dim, backend=HDFVirtualBackend
    )
    refs_path = f"{tmpdir}/kerchunk.json"
    vds.virtualize.to_kerchunk(refs_path, format="json")
    actual = xr.open_dataset(refs_path, engine="kerchunk")
    xrt.assert_equal(expected, actual)
6 changes: 3 additions & 3 deletions virtualizarr/tests/test_xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,11 +322,11 @@ def test_mixture_of_manifestarrays_and_numpy_arrays(
@requires_imagecodecs
def test_nbytes(simple_netcdf4):
vds = open_virtual_dataset(simple_netcdf4)
assert vds.virtualize.nbytes == 88
assert vds.nbytes == 104
assert vds.virtualize.nbytes == 32
assert vds.nbytes == 48

vds = open_virtual_dataset(simple_netcdf4, loadable_variables=["foo"])
assert vds.virtualize.nbytes == 104
assert vds.virtualize.nbytes == 48
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👏🏻


ds = open_dataset(simple_netcdf4)
assert ds.virtualize.nbytes == ds.nbytes