diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 8cbfd9e9089..427130d8432 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -75,9 +75,12 @@ Internal Changes within ``as_compatible_data``. This is consistent with how lists of these objects will be converted (:pull:`9900`). By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_. +- Move ISO-8601 parser from coding.cftimeindex to coding.times to make it available there (prevents circular import), add capability to parse negative and/or five-digit years (:pull:`9899`). + By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_. - Refactor of time coding to prepare for relaxing nanosecond restriction (:pull:`9906`). By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_. + .. _whats-new.2024.11.0: v.2024.11.0 (Nov 22, 2024) diff --git a/properties/test_encode_decode.py b/properties/test_encode_decode.py index e7eece7e81e..1d5b43a6da6 100644 --- a/properties/test_encode_decode.py +++ b/properties/test_encode_decode.py @@ -5,17 +5,22 @@ """ +import warnings + import pytest pytest.importorskip("hypothesis") # isort: split import hypothesis.extra.numpy as npst +import hypothesis.strategies as st import numpy as np from hypothesis import given import xarray as xr -from xarray.testing.strategies import variables +from xarray.coding.times import _parse_iso8601 +from xarray.testing.strategies import CFTimeStrategyISO8601, variables +from xarray.tests import requires_cftime @pytest.mark.slow @@ -43,3 +48,13 @@ def test_CFScaleOffset_coder_roundtrip(original) -> None: coder = xr.coding.variables.CFScaleOffsetCoder() roundtripped = coder.decode(coder.encode(original)) xr.testing.assert_identical(original, roundtripped) + + +@requires_cftime +@given(dt=st.datetimes() | CFTimeStrategyISO8601()) +def test_iso8601_decode(dt): + iso = dt.isoformat() + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message=".*date/calendar/year zero.*") + parsed, _ = _parse_iso8601(type(dt), iso) + assert dt == parsed diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index 
2cd8eccd6f3..50b048a8e29 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -53,9 +53,10 @@ import pandas as pd from packaging.version import Version -from xarray.coding.cftimeindex import CFTimeIndex, _parse_iso8601_with_reso +from xarray.coding.cftimeindex import CFTimeIndex from xarray.coding.times import ( _is_standard_calendar, + _parse_iso8601, _should_cftime_be_used, convert_time_or_go_back, format_cftime_datetime, @@ -843,7 +844,7 @@ def to_cftime_datetime(date_str_or_date, calendar=None): "If converting a string to a cftime.datetime object, " "a calendar type must be provided" ) - date, _ = _parse_iso8601_with_reso(get_date_type(calendar), date_str_or_date) + date, _ = _parse_iso8601(get_date_type(calendar), date_str_or_date) return date elif isinstance(date_str_or_date, cftime.datetime): return date_str_or_date diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index 0494952fc9c..596a51a0dcf 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -42,7 +42,6 @@ from __future__ import annotations import math -import re import warnings from datetime import timedelta from typing import TYPE_CHECKING, Any @@ -53,6 +52,7 @@ from xarray.coding.times import ( _STANDARD_CALENDARS, + _parse_iso8601, cftime_to_nptime, infer_calendar_name, ) @@ -78,71 +78,6 @@ OUT_OF_BOUNDS_TIMEDELTA_ERRORS = (OverflowError,) -def named(name, pattern): - return "(?P<" + name + ">" + pattern + ")" - - -def optional(x): - return "(?:" + x + ")?" 
- - -def trailing_optional(xs): - if not xs: - return "" - return xs[0] + optional(trailing_optional(xs[1:])) - - -def build_pattern(date_sep=r"\-", datetime_sep=r"T", time_sep=r"\:", micro_sep=r"."): - pieces = [ - (None, "year", r"\d{4}"), - (date_sep, "month", r"\d{2}"), - (date_sep, "day", r"\d{2}"), - (datetime_sep, "hour", r"\d{2}"), - (time_sep, "minute", r"\d{2}"), - (time_sep, "second", r"\d{2}"), - (micro_sep, "microsecond", r"\d{1,6}"), - ] - pattern_list = [] - for sep, name, sub_pattern in pieces: - pattern_list.append((sep if sep else "") + named(name, sub_pattern)) - # TODO: allow timezone offsets? - return "^" + trailing_optional(pattern_list) + "$" - - -_BASIC_PATTERN = build_pattern(date_sep="", time_sep="") -_EXTENDED_PATTERN = build_pattern() -_CFTIME_PATTERN = build_pattern(datetime_sep=" ") -_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN] - - -def parse_iso8601_like(datetime_string): - for pattern in _PATTERNS: - match = re.match(pattern, datetime_string) - if match: - return match.groupdict() - raise ValueError( - f"no ISO-8601 or cftime-string-like match for string: {datetime_string}" - ) - - -def _parse_iso8601_with_reso(date_type, timestr): - _ = attempt_import("cftime") - - default = date_type(1, 1, 1) - result = parse_iso8601_like(timestr) - replace = {} - - for attr in ["year", "month", "day", "hour", "minute", "second", "microsecond"]: - value = result.get(attr, None) - if value is not None: - if attr == "microsecond": - # convert match string into valid microsecond value - value = 10 ** (6 - len(value)) * int(value) - replace[attr] = int(value) - resolution = attr - return default.replace(**replace), resolution - - def _parsed_string_to_bounds(date_type, resolution, parsed): """Generalization of pandas.tseries.index.DatetimeIndex._parsed_string_to_bounds @@ -436,7 +371,7 @@ def _partial_date_slice(self, resolution, parsed): def _get_string_slice(self, key): """Adapted from 
pandas.tseries.index.DatetimeIndex._get_string_slice""" - parsed, resolution = _parse_iso8601_with_reso(self.date_type, key) + parsed, resolution = _parse_iso8601(self.date_type, key) try: loc = self._partial_date_slice(resolution, parsed) except KeyError as err: @@ -483,7 +418,7 @@ def _maybe_cast_slice_bound(self, label, side): if not isinstance(label, str): return label - parsed, resolution = _parse_iso8601_with_reso(self.date_type, label) + parsed, resolution = _parse_iso8601(self.date_type, label) start, end = _parsed_string_to_bounds(self.date_type, resolution, parsed) if self.is_monotonic_decreasing and len(self) > 1: return end if side == "left" else start @@ -811,11 +746,6 @@ def is_leap_year(self): return func(self.year, calendar=self.calendar) -def _parse_iso8601_without_reso(date_type, datetime_str): - date, _ = _parse_iso8601_with_reso(date_type, datetime_str) - return date - - def _parse_array_of_cftime_strings(strings, date_type): """Create a numpy array from an array of strings. @@ -833,9 +763,9 @@ def _parse_array_of_cftime_strings(strings, date_type): ------- np.array """ - return np.array( - [_parse_iso8601_without_reso(date_type, s) for s in strings.ravel()] - ).reshape(strings.shape) + return np.array([_parse_iso8601(date_type, s)[0] for s in strings.ravel()]).reshape( + strings.shape + ) def _contains_datetime_timedeltas(array): diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 326682aab56..26840c6fd22 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -193,17 +193,87 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]: return delta_units, ref_date -def _maybe_strip_tz_from_timestamp(date: pd.Timestamp) -> pd.Timestamp: +def named(name: str, pattern: str) -> str: + return "(?P<" + name + ">" + pattern + ")" + + +def optional(x: str) -> str: + return "(?:" + x + ")?" 
+ + +def trailing_optional(xs: list[str]) -> str: + if not xs: + return "" + return xs[0] + optional(trailing_optional(xs[1:])) + + +def build_pattern( + date_sep: str = r"\-", + datetime_sep: str = r"T", + time_sep: str = r"\:", + micro_sep: str = r"\.", # escaped: a bare "." is a regex wildcard and would accept any separator character +) -> str: + pieces = [ + (None, "year", r"[+-]?\d{4,5}"), + (date_sep, "month", r"\d{2}"), + (date_sep, "day", r"\d{2}"), + (datetime_sep, "hour", r"\d{2}"), + (time_sep, "minute", r"\d{2}"), + (time_sep, "second", r"\d{2}"), + (micro_sep, "microsecond", r"\d{1,6}"), + ] + pattern_list = [] + for sep, name, sub_pattern in pieces: + pattern_list.append((sep if sep else "") + named(name, sub_pattern)) + # TODO: allow timezone offsets? + return "^" + trailing_optional(pattern_list) + "$" + + +_BASIC_PATTERN = build_pattern(date_sep="", time_sep="") +_EXTENDED_PATTERN = build_pattern() +_CFTIME_PATTERN = build_pattern(datetime_sep=" ") +_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN] + + +def parse_iso8601_like(datetime_string: str) -> dict[str, str | None]: + for pattern in _PATTERNS: + match = re.match(pattern, datetime_string) + if match: + return match.groupdict() + raise ValueError( + f"no ISO-8601 or cftime-string-like match for string: {datetime_string}" + ) + + +def _parse_iso8601(date_type, timestr): + default = date_type(1, 1, 1) + result = parse_iso8601_like(timestr) + replace = {} + + for attr in ["year", "month", "day", "hour", "minute", "second", "microsecond"]: + value = result.get(attr, None) + if value is not None: + resolution = attr + if attr == "microsecond": + if len(value) <= 3: + resolution = "millisecond" + # convert match string into valid microsecond value + value = 10 ** (6 - len(value)) * int(value) + replace[attr] = int(value) + return default.replace(**replace), resolution + + + def _maybe_strip_tz_from_timestamp(date: pd.Timestamp) -> pd.Timestamp: # If the ref_date Timestamp is timezone-aware, convert to UTC and # make it timezone-naive (GH 2649). 
if date.tz is not None: return date.tz_convert("UTC").tz_convert(None) - return date - - + return date + + def _unpack_time_unit_and_ref_date( units: str, -) -> tuple[NPDatetimeUnitOptions, pd.Timestamp]: +) -> tuple[NPDatetimeUnitOptions, pd.Timestamp]: # same us _unpack_netcdf_time_units but finalizes ref_date for # processing in encode_cf_datetime time_unit, _ref_date = _unpack_netcdf_time_units(units) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 2b992f0249b..11f9ee49ca2 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -645,7 +645,7 @@ def try_read_magic_number_from_path(pathlike, count=8) -> bytes | None: try: with open(path, "rb") as f: return read_magic_number_from_file(f, count) - except (FileNotFoundError, TypeError): + except (FileNotFoundError, IsADirectoryError, TypeError): pass return None diff --git a/xarray/testing/strategies.py b/xarray/testing/strategies.py index cfa226d991c..e60572fbddd 100644 --- a/xarray/testing/strategies.py +++ b/xarray/testing/strategies.py @@ -1,3 +1,5 @@ +import datetime +import warnings from collections.abc import Hashable, Iterable, Mapping, Sequence from typing import TYPE_CHECKING, Any, Protocol, overload @@ -473,3 +475,36 @@ def unique_subset_of( return ( {k: objs[k] for k in subset_keys} if isinstance(objs, Mapping) else subset_keys ) + + +class CFTimeStategy(st.SearchStrategy): + def __init__(self, min_value, max_value): + self.min_value = min_value + self.max_value = max_value + + def do_draw(self, data): + unit_microsecond = datetime.timedelta(microseconds=1) + timespan_microseconds = (self.max_value - self.min_value) // unit_microsecond + result = data.draw_integer(0, timespan_microseconds) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message=".*date/calendar/year zero.*") + return self.min_value + datetime.timedelta(microseconds=result) + + +class CFTimeStrategyISO8601(st.SearchStrategy): + def __init__(self): + from xarray.tests.test_coding_times import 
_all_cftime_date_types + + self.date_types = _all_cftime_date_types() + self.calendars = list(self.date_types) + + def do_draw(self, data): + calendar = data.draw(st.sampled_from(self.calendars)) + date_type = self.date_types[calendar] + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message=".*date/calendar/year zero.*") + daysinmonth = date_type(99999, 12, 1).daysinmonth + min_value = date_type(-99999, 1, 1) + max_value = date_type(99999, 12, daysinmonth, 23, 59, 59, 999999) + strategy = CFTimeStategy(min_value, max_value) + return strategy.do_draw(data) diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index d1fccc52a9a..2f527bf298e 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -12,9 +12,11 @@ from xarray.coding.cftimeindex import ( CFTimeIndex, _parse_array_of_cftime_strings, - _parse_iso8601_with_reso, _parsed_string_to_bounds, assert_all_valid_date_type, +) +from xarray.coding.times import ( + _parse_iso8601, parse_iso8601_like, ) from xarray.tests import ( @@ -132,16 +134,34 @@ def date_dict( list(ISO8601_LIKE_STRING_TESTS.values()), ids=list(ISO8601_LIKE_STRING_TESTS.keys()), ) -def test_parse_iso8601_like(string, expected): - result = parse_iso8601_like(string) +@pytest.mark.parametrize( + "five_digit_year", [False, True], ids=["four-digit-year", "five-digit-year"] +) +@pytest.mark.parametrize("sign", ["", "+", "-"], ids=["None", "plus", "minus"]) +def test_parse_iso8601_like( + five_digit_year: bool, sign: str, string: str, expected: dict +) -> None: + pre = "1" if five_digit_year else "" + datestring = sign + pre + string + result = parse_iso8601_like(datestring) + expected = expected.copy() + expected.update(year=sign + pre + expected["year"]) assert result == expected - if result["microsecond"] is None: + # check malformed single digit addendum + # this check is only performed when we have at least "hour" given + # like "1999010101", where a single added digit 
should raise + # for "1999" (year), "199901" (month) and "19990101" (day) + # and a single added digit the string would just be interpreted + # as having a 5-digit year. + if result["microsecond"] is None and result["hour"] is not None: with pytest.raises(ValueError): - parse_iso8601_like(string + "3") - if result["second"] is None: + parse_iso8601_like(datestring + "3") + + # check malformed floating point addendum + if result["second"] is None or result["microsecond"] is not None: with pytest.raises(ValueError): - parse_iso8601_like(string + ".3") + parse_iso8601_like(datestring + ".3") _CFTIME_CALENDARS = [ @@ -348,7 +368,7 @@ def test_cftimeindex_days_in_month_accessor(index): def test_parse_iso8601_with_reso(date_type, string, date_args, reso): expected_date = date_type(*date_args) expected_reso = reso - result_date, result_reso = _parse_iso8601_with_reso(date_type, string) + result_date, result_reso = _parse_iso8601(date_type, string) assert result_date == expected_date assert result_reso == expected_reso