diff --git a/src/earthkit/data/readers/grib/xarray.py b/src/earthkit/data/readers/grib/xarray.py index 9d6e8a26..2d71698a 100644 --- a/src/earthkit/data/readers/grib/xarray.py +++ b/src/earthkit/data/readers/grib/xarray.py @@ -231,10 +231,12 @@ def to_xarray(self, engine=None, xarray_open_dataset_kwargs=None, **kwargs): - "fixed": Use the attributes defined in ``variable_attrs`` as variables attributes and ``global_attrs`` as global attributes. - "unique": Use all the attributes defined in ``attrs``, ``variable_attrs`` - and ``global_attrs``. When an attribute has unique a value for a dataset + and ``global_attrs``. When an attribute has a unique value for a dataset it will be a global attribute, otherwise it will be a variable attribute. - However keys in ``variable_attrs`` are always used as variable attributes, - while keys in ``global_attrs`` are always used as global attributes. + However, this logic is only applied if a unique variable attribute can be + a global attribute according to the CF conventions Appendix A (e.g. "units" cannot + be a global attribute). Additionally, keys in ``variable_attrs`` are always used as + variable attributes, while keys in ``global_attrs`` are always used as global attributes. * attrs: str, number, callable, dict or list of these, None Attribute or list of attributes. Only used when ``attrs_mode`` is ``unique``. Its default value (None) expands to [] unless the ``profile`` overwrites it. 
diff --git a/src/earthkit/data/utils/xarray/attrs.py b/src/earthkit/data/utils/xarray/attrs.py index 2515016f..6762fd00 100644 --- a/src/earthkit/data/utils/xarray/attrs.py +++ b/src/earthkit/data/utils/xarray/attrs.py @@ -8,9 +8,11 @@ # import logging +import os from abc import ABCMeta from abc import abstractmethod from collections import defaultdict +from functools import cached_property from earthkit.data.utils import ensure_dict from earthkit.data.utils import ensure_iterable @@ -18,6 +20,36 @@ LOG = logging.getLogger(__name__) +class CFAttrs: + def _load(self): + here = os.path.dirname(__file__) + path = os.path.join(here, "cf_attrs.yaml") + if os.path.exists(path): + import yaml + + try: + with open(path, "r") as f: + return yaml.safe_load(f) + except Exception as e: + LOG.exception(f"Failed read CF attributes file {path}. {e}") + raise + else: + raise ValueError(f"CF attributes file not found! path={path}") + + @cached_property + def attrs(self): + return self._load() + + def can_be_global(self, name): + item = self.attrs.get(name, None) + if item: + return "G" in item["use"] + return True + + +CF_ATTRS = CFAttrs() + + class Attr: """Generic attribute class. @@ -238,7 +270,7 @@ def _id(x): global_attrs[item.name] = item.value() # TODO: make it optional - global_attrs.pop("units", None) + # global_attrs.pop("units", None) return global_attrs @@ -257,13 +289,24 @@ def _build(self, ds, t_vars, rename=None): if len(v) == 1 and k not in self.attrs.variable_attrs: global_attrs[k] = list(v)[0] - for var_obj in t_vars.values(): - var_obj.adjust_attrs(drop_keys=global_attrs.keys(), rename=rename) + # Some attrs cannot be global according to the CF convention. + # These are removed from global attrs and kept as variable attrs. 
+ global_attrs_keys = list(global_attrs.keys()) + global_attrs_renamed_keys = global_attrs_keys + if rename: + global_attrs_renamed_keys = list(rename(global_attrs).keys()) + + for k1, k2 in zip(global_attrs_keys, global_attrs_renamed_keys): + if not CF_ATTRS.can_be_global(k1) or not CF_ATTRS.can_be_global(k2): + global_attrs.pop(k1) for k in self.attrs.variable_attrs: if k in global_attrs: global_attrs.pop(k) + for var_obj in t_vars.values(): + var_obj.adjust_attrs(drop_keys=global_attrs.keys(), rename=rename) + global_attrs = {k: v for k, v in global_attrs.items() if v is not None} return global_attrs diff --git a/src/earthkit/data/utils/xarray/cf_attrs.yaml b/src/earthkit/data/utils/xarray/cf_attrs.yaml new file mode 100644 index 00000000..03767eb3 --- /dev/null +++ b/src/earthkit/data/utils/xarray/cf_attrs.yaml @@ -0,0 +1,195 @@ +# Based on CF Conventions Appendix A +# All CF attributes are listed here except for those that are used to describe grid mappings. See Appendix F for the grid mapping attributes. +# The 'Type' values are S for string, N for numeric, and D for the type of the data variable. 
+# The 'Use' values are G for global, C for variables containing coordinate data, and D for variables containing non-coordinate data +Conventions: + type: S + use: G +_FillValue: + type: D + use: + - C + - D +actual_range: + type: N + use: + - C + - D +add_offset: + type: N + use: + - C + - D +ancillary_variables: + type: S + use: D +axis: + type: S + use: C +bounds: + type: S + use: C +calendar: + type: S + use: C +cell_measures: + type: S + use: D +cell_methods: + type: S + use: D +cf_role: + type: S + use: C +climatology: + type: S + use: C +comment: + type: S + use: + - G + - C + - D +compress: + type: S + use: C +computed_standard_name: + type: S + use: C +coordinates: + type: S + use: + - D + - M +external_variables: + type: S + use: G +featureType: + type: S + use: G +flag_masks: + type: D + use: D +flag_meanings: + type: S + use: D +flag_values: + type: D + use: D +formula_terms: + type: S + use: C +geometry: + type: S + use: + - C + - D +geometry_type: + type: S + use: M +grid_mapping: + type: S + use: + - D + - M +history: + type: S + use: + - G + - Gr +instance_dimension: + type: S + use: "-" +institution: + type: S + use: + - G + - D +interior_ring: + type: S + use: M +leap_month: + type: N + use: C +leap_year: + type: N + use: C +long_name: + type: S + use: + - C + - D +missing_value: + type: D + use: + - C + - D +month_lengths: + type: N + use: C +node_coordinates: + type: S + use: M +node_count: + type: S + use: M +nodes: + type: S + use: C +part_node_count: + type: S + use: M +positive: + type: S + use: C +references: + type: S + use: + - G + - D +sample_dimension: + type: S + use: "-" +scale_factor: + type: N + use: + - C + - D +source: + type: S + use: + - G + - D +standard_error_multiplier: + type: N + use: D +standard_name: + type: S + use: + - C + - D +title: + type: S + use: + - G + - Gr +units: + type: S + use: + - C + - D +valid_max: + type: N + use: + - C + - D +valid_min: + type: N + use: + - C + - D +valid_range: + type: N + use: + - C + 
- D diff --git a/src/earthkit/data/utils/xarray/engine.py b/src/earthkit/data/utils/xarray/engine.py index 67304b03..1dfaa20f 100644 --- a/src/earthkit/data/utils/xarray/engine.py +++ b/src/earthkit/data/utils/xarray/engine.py @@ -188,10 +188,12 @@ def open_dataset( - "fixed": Use the attributes defined in ``variable_attrs`` as variables attributes and ``global_attrs`` as global attributes. - "unique": Use all the attributes defined in ``attrs``, ``variable_attrs`` - and ``global_attrs``. When an attribute has unique a value for a dataset + and ``global_attrs``. When an attribute has a unique value for a dataset it will be a global attribute, otherwise it will be a variable attribute. - However keys in ``variable_attrs`` are always used as variable attributes, - while keys in ``global_attrs`` are always used as global attributes. + However, this logic is only applied if a unique variable attribute can be + a global attribute according to the CF conventions Appendix A (e.g. "units" cannot + be a global attribute). Additionally, keys in ``variable_attrs`` are always used as + variable attributes, while keys in ``global_attrs`` are always used as global attributes. attrs: str, number, callable, dict or list of these, None Attribute or list of attributes. Only used when ``attrs_mode`` is ``unique``. Its default value (None) expands to [] unless the ``profile`` overwrites it. 
diff --git a/tests/xr_engine/test_xr_engine.py b/tests/xr_engine/test_xr_engine.py index 4aa37946..59a9e45a 100644 --- a/tests/xr_engine/test_xr_engine.py +++ b/tests/xr_engine/test_xr_engine.py @@ -469,10 +469,14 @@ def test_xr_engine_single_field(): lats = np.linspace(90, -90, 19) lons = np.linspace(0, 350, 36) - attrs_ref = { - "param": "t", + var_attrs_ref = { "standard_name": "air_temperature", "long_name": "Temperature", + "units": "K", + } + + global_attrs_ref = { + "param": "t", "paramId": 130, "class": "od", "stream": "oper", @@ -488,7 +492,7 @@ def test_xr_engine_single_field(): "institution": "ECMWF", } - assert ds.attrs == attrs_ref + assert ds.attrs == global_attrs_ref data_vars = ["t"] @@ -510,6 +514,9 @@ def test_xr_engine_single_field(): da = ds["t"] + for k, v in var_attrs_ref.items(): + assert da.attrs[k] == v + r = da[:, :] r.shape == (19, 36) assert np.allclose(r.values, vals_ref)