Skip to content

Commit

Permalink
First ideas of validation based on hdf tree traversal
Browse files Browse the repository at this point in the history
  • Loading branch information
domna committed Jun 7, 2024
1 parent c6e12a8 commit a96d2d2
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 4 deletions.
11 changes: 10 additions & 1 deletion dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ babel==2.14.0
# via mkdocs-material
build==1.1.1
# via pip-tools
cachetools==5.3.3
# via pynxtools (pyproject.toml)
certifi==2024.2.2
# via requests
cfgv==3.4.0
Expand Down Expand Up @@ -143,6 +145,10 @@ pluggy==1.4.0
# via pytest
pre-commit==3.7.0
# via pynxtools (pyproject.toml)
pydantic==2.7.1
# via pynxtools (pyproject.toml)
pydantic-core==2.18.2
# via pydantic
pygments==2.17.2
# via mkdocs-material
pymdown-extensions==10.7.1
Expand Down Expand Up @@ -211,7 +217,10 @@ types-pyyaml==6.0.12.20240311
types-requests==2.31.0.20240311
# via pynxtools (pyproject.toml)
typing-extensions==4.10.0
# via mypy
# via
# mypy
# pydantic
# pydantic-core
tzdata==2024.1
# via pandas
urllib3==2.2.1
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ dependencies = [
"importlib-metadata",
"lxml>=4.9.1",
"anytree",
"cachetools",
]

[project.urls]
Expand Down
89 changes: 86 additions & 3 deletions src/pynxtools/dataconverter/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,14 @@
from collections import defaultdict
from functools import reduce
from operator import getitem
from typing import Any, Iterable, List, Mapping, Optional, Tuple, Union
from typing import Any, Iterable, List, Mapping, Optional, Set, Tuple, Union

import h5py
import lxml.etree as ET
import numpy as np
from anytree import Resolver
from cachetools import LRUCache, cached
from cachetools.keys import hashkey

from pynxtools.dataconverter.helpers import (
Collector,
Expand All @@ -42,20 +44,101 @@
from pynxtools.definitions.dev_tools.utils.nxdl_utils import get_nx_namefit


def best_namefit_of_(
    name: str, concepts: Set[str], nx_class: Optional[str] = None
) -> Optional[str]:
    """
    Placeholder for selecting the entry of `concepts` that best fits `name`.

    The body is not implemented yet, so the function currently always
    returns None.

    Args:
        name: The name for which a fitting concept is wanted.
        concepts: The candidate concept names to choose from.
        nx_class: Optional NeXus class of the named object; intended to be
            taken into account once the function is implemented.

    Returns:
        Currently always None.
    """
    # TODO: Find the best namefit of name in concepts
    # Consider nx_class if it is not None
    ...


def validate_hdf_group_against(appdef: str, data: h5py.Group):
    """
    Validate the members of an HDF5 group against an application definition.

    Every group and dataset below `data` is visited and looked up in the tree
    generated for `appdef`. Fields and attributes that were found are removed
    from the list of required entries; whatever is still left afterwards is
    reported through the collector as missing.
    THIS IS JUST A FUNCTION SKELETON AND IS NOT WORKING YET!

    Args:
        appdef: The application definition from which the tree is generated.
        data: The HDF5 group whose members are validated.
    """

    # Only cache based on path. That way we retain the nx_class information
    # in the tree
    # Allow for 10000 cache entries. This should be enough for most cases
    @cached(
        cache=LRUCache(maxsize=10000),
        # `nx_class` is optional, so the key function must accept calls both
        # with and without it (positional or keyword).
        key=lambda path, *_args, **_kwargs: hashkey(path),
    )
    def find_node_for(
        path: str, nx_class: Optional[str] = None
    ) -> Optional[NexusNode]:
        """Return the tree node matching `path`, or None if there is none."""
        if path == "":
            return tree

        # Member names are slash-separated and a top-level member contains no
        # slash at all; rpartition then yields an empty parent path instead
        # of failing to unpack.
        prev_path, _, last_elem = path.rpartition("/")
        node = find_node_for(prev_path)
        if node is None:
            # Nothing below an unmatched parent can be matched either.
            return None

        best_child = best_namefit_of_(
            last_elem,
            # TODO: Consider renaming `get_all_children_names` to
            # `get_all_direct_children_names`. Because that's what it is.
            node.get_all_children_names(),
            nx_class,
        )
        if best_child is None:
            return None

        return node.search_child_with_name(best_child)

    def remove_from_req_fields(path: str):
        """Mark `path` as found by dropping it from the required entries."""
        if path in required_fields:
            required_fields.remove(path)

    def handle_group(path: str, data: h5py.Group):
        """Look up the node for a group, passing on its NX_class attribute."""
        node = find_node_for(path, data.attrs.get("NX_class"))
        if node is None:
            # TODO: Log undocumented
            return

        # TODO: Do actual group checks

    def handle_field(path: str, data: h5py.Dataset):
        """Look up the node for a dataset and mark the field as found."""
        node = find_node_for(path)
        if node is None:
            # TODO: Log undocumented
            return
        remove_from_req_fields(f"{path}")

        # TODO: Do actual field checks

    def handle_attributes(path: str, attribute_names: h5py.AttributeManager):
        """Look up every attribute of the object at `path` and mark it found."""
        for attr_name in attribute_names:
            node = find_node_for(f"{path}/{attr_name}")
            if node is None:
                # TODO: Log undocumented
                continue
            remove_from_req_fields(f"{path}/@{attr_name}")

            # TODO: Do actual attribute checks

    def validate(path: str, data: Union[h5py.Group, h5py.Dataset]):
        """Callback for `visititems`; dispatches on the kind of HDF5 object."""
        # Namefit name against tree (use recursive caching)
        if isinstance(data, h5py.Group):
            handle_group(path, data)
        elif isinstance(data, h5py.Dataset):
            handle_field(path, data)

        handle_attributes(path, data.attrs)

    tree = generate_tree_from(appdef)
    required_fields = tree.required_fields_and_attrs_names()
    # h5py spells the recursive visitor `visititems`; the callback returns
    # None, so the traversal covers every member.
    data.visititems(validate)

    # Everything still listed was never encountered during the traversal.
    for req_field in required_fields:
        if "@" in req_field:
            collector.collect_and_log(
                req_field, ValidationProblem.MissingRequiredAttribute, None
            )
            continue
        collector.collect_and_log(
            req_field, ValidationProblem.MissingRequiredField, None
        )


def build_nested_dict_from(
mapping: Mapping[str, Any],
Expand Down

0 comments on commit a96d2d2

Please sign in to comment.