Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve http url glob error #9930

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 25 additions & 21 deletions xarray/backends/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
NdimSizeLenMixin,
attempt_import,
emit_user_level_warning,
is_http_url,
is_remote_uri,
)
from xarray.namedarray.parallelcompat import get_chunked_array_type
Expand Down Expand Up @@ -143,28 +144,31 @@ def _find_absolute_paths(
['common.py']
"""
if isinstance(paths, str):
if is_remote_uri(paths) and kwargs.get("engine") == "zarr":
if TYPE_CHECKING:
import fsspec
if is_remote_uri(paths):
if not is_http_url(paths):
# remote uris that are not http are likely s3 or similar, which support globbing with fsspec

if TYPE_CHECKING:
import fsspec
else:
fsspec = attempt_import("fsspec")

fs, _, _ = fsspec.core.get_fs_token_paths(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should probably be url_to_fs these days, as it's more user-facing.

paths,
mode="rb",
storage_options=kwargs.get("backend_kwargs", {}).get(
"storage_options", {}
),
expand=False,
)
tmp_paths = fs.glob(fs._strip_protocol(paths)) # finds directories
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could pass detail=True, and then filter for directories

return [fs.get_mapper(path) for path in tmp_paths]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought zarr, in particular, didn't wan't mappers any more, but Store objects, or URLs that will be immediately converted to Stores within zarr anyway.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jhamman has the recommendation changed here recently?

else:
fsspec = attempt_import("fsspec")

fs, _, _ = fsspec.core.get_fs_token_paths(
paths,
mode="rb",
storage_options=kwargs.get("backend_kwargs", {}).get(
"storage_options", {}
),
expand=False,
)
tmp_paths = fs.glob(fs._strip_protocol(paths)) # finds directories
return [fs.get_mapper(path) for path in tmp_paths]
elif is_remote_uri(paths):
raise ValueError(
"cannot do wild-card matching for paths that are remote URLs "
f"unless engine='zarr' is specified. Got paths: {paths}. "
"Instead, supply paths as an explicit list of strings."
)
raise ValueError(
"cannot do wild-card matching for paths that are remote http URLs. "
f"Got paths: {paths}. "
"Instead, supply paths as an explicit list of strings."
)
else:
return sorted(glob(_normalize_path(paths)))
elif isinstance(paths, os.PathLike):
Expand Down
10 changes: 9 additions & 1 deletion xarray/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -617,14 +617,22 @@ def close_on_error(f):


def is_remote_uri(path: str) -> bool:
"""Finds URLs of the form protocol:// or protocol::
"""Matches URLs of the form protocol:// or protocol::

This also matches for http[s]://, which were the only remote URLs
supported in <=v0.16.2.
"""
return bool(re.search(r"^[a-z][a-z0-9]*(\://|\:\:)", path))


def is_http_url(path: str) -> bool:
"""Matches URLs of the form http[s]://

Does not match for
"""
return bool(re.search(r"^https?\://", path))


def read_magic_number_from_file(filename_or_obj, count=8) -> bytes:
# check byte header to determine file type
if isinstance(filename_or_obj, bytes):
Expand Down
10 changes: 5 additions & 5 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -4741,11 +4741,11 @@ def test_open_mfdataset(self) -> None:

@requires_fsspec
def test_open_mfdataset_no_files(self) -> None:
pytest.importorskip("aiobotocore")

# glob is attempted as of #4823, but finds no files
with pytest.raises(OSError, match=r"no files"):
open_mfdataset("http://some/remote/uri", engine="zarr")
with pytest.raises(
ValueError,
match=r"cannot do wild-card matching for paths that are remote http URLs",
):
open_mfdataset("http://some/remote/uri")

def test_open_mfdataset_2d(self) -> None:
original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))})
Expand Down
9 changes: 9 additions & 0 deletions xarray/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,15 @@ def test_is_remote_uri():
assert utils.is_remote_uri("https://example.com")
assert not utils.is_remote_uri(" http://example.com")
assert not utils.is_remote_uri("example.nc")
assert utils.is_remote_uri("s3://bucket/example.nc")


def test_is_http_url():
assert utils.is_http_url("http://example.com")
assert utils.is_http_url("https://example.com")
assert not utils.is_http_url(" http://example.com")
assert not utils.is_http_url("example.nc")
assert not utils.is_http_url("s3://bucket/example.nc")


class Test_is_uniform_and_sorted:
Expand Down
Loading