def _extract_string(value, parse_opts):
    """Apply the optional "index" and "regexp" steps of a parse_string entry."""
    if "index" in parse_opts:
        value = value[int(parse_opts["index"])]

    if "regexp" in parse_opts:
        match = re.match(parse_opts["regexp"], value)
        if match is None:
            # Without this check the failure surfaces as an AttributeError on None.
            raise ValueError(
                f"Regular expression {parse_opts['regexp']!r} "
                f"does not match {value!r}"
            )
        # Unmatched optional groups are replaced by empty strings.
        groups = match.groups("")
        value = " ".join(groups) if groups else match.group(0)

    return value


def _resolve_timezone(parse_opts):
    """Return the tzinfo named by the "timezone" item, or the local time zone."""
    if "timezone" not in parse_opts:
        return dateutil.tz.gettz()

    tz = dateutil.tz.gettz(parse_opts["timezone"])
    if tz is None:
        # gettz returns None for names it cannot resolve; attaching None would
        # silently produce a naive time stamp without UTC offset.
        raise ValueError(f"Unknown time zone {parse_opts['timezone']!r}")
    return tz


def _to_iso_datetime(value, parse_opts):
    """Apply the "datetime", "dateutil" or "timestamp" step, if one is requested.

    The three options are checked in this order and the first one present wins.
    If none is present, the value is returned unchanged.
    """
    if "datetime" in parse_opts:
        dt = datetime.datetime.strptime(value, parse_opts["datetime"])
    elif "dateutil" in parse_opts:
        order = parse_opts["dateutil"].lower()
        try:
            y, m, d = (order.index(letter) for letter in "ymd")
        except ValueError as err:
            raise ValueError(
                f"Invalid dateutil order {parse_opts['dateutil']!r}: "
                "expected a permutation of 'Y', 'M' and 'D'"
            ) from err
        dt = dateutil.parser.parse(value, yearfirst=y < m, dayfirst=d < m)
    elif "timestamp" in parse_opts:
        dt = datetime.datetime.fromtimestamp(
            float(value), tz=_resolve_timezone(parse_opts)
        )
    else:
        return value

    # Only fill in the configured zone if the parsed value carries no offset itself.
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=_resolve_timezone(parse_opts))
    return dt.isoformat()


def parse_strings(mapping, data):
    """
    Parse strings, notably date and time, from custom format

    The function can do the following operations, in the given order, on string data.
    The result of each operation is passed on as input of the next one.

    1. Extract element from array by index.
    2. Match a regular expression.
    3. Parse date and time using the datetime or dateutil parser,
       or interpret the value as a POSIX timestamp.

    The resulting string replaces the mapped value (dictionary) in the mapping dictionary.
    If date parsing is enabled, the resulting string is ISO-formatted.
    Mapping entries that are not dictionaries or have no "parse_string" item are left untouched.
    An entry is also left untouched if looking up its data path raises KeyError or TypeError.

    The operations are selected and tuned by the following dictionary items:

    "parse_string": (required) Data path of the string (array) like for regular datasets.
        If this item is missing, string parsing is skipped altogether.
    "index": (optional) Element index to extract from string array.
        The original data must be a string array.
        If this option is not specified, the original data must be a singular string.
    "regexp": (optional) Match regular expression (anchored at the start of the string),
        keeping only the matching part.
        If the expression contains groups, the result will be a space-delimited
        concatenation of the matching groups.
        If the expression does not contain explicit groups, the whole match is used.
    "datetime": (optional) Format string for datetime.datetime.strptime function.
    "dateutil": (optional) Date ordering for the dateutil.parser.parse function.
        Possible values 'YMD', 'MDY', 'DMY' (or lower case).
    "timestamp": (optional) Interpret the data item as a POSIX timestamp.
    "timezone": (optional) Time zone to use if the date-time string does not include a UTC offset,
        in a dateutil-supported format, e.g. "Europe/Berlin".
        By default, the local time zone is used.

    The "datetime", "dateutil" and "timestamp" options are mutually exclusive;
    if several are given, the first one in this order is used.

    :param mapping: Mapping dictionary, modified in place
    :param data: Data dictionary
    :return: None
    :raises ValueError: if the regular expression does not match, the time zone is unknown,
        the dateutil order is invalid, or the date string cannot be parsed
    """

    # Only values of existing keys are replaced, so iterating over items() is safe.
    for key, parse_opts in mapping.items():
        try:
            value = parse_opts["parse_string"]
            if is_path(value):
                value = get_val_nested_keystring_from_dict(value[1:], data)
        except (KeyError, TypeError):
            continue

        value = _extract_string(value, parse_opts)
        mapping[key] = _to_iso_datetime(value, parse_opts)
+ If the expression contains groups, the result will be a space-delimited concatenation of the matching groups. + If the expression does not contain explicit groups, the whole match is used. + "datetime": (optional) Format string for datetime.datetime.strptime function. + If specified, use datetime.datetime.strptime for date parsing. + "dateutil": (optional) Date ordering for the dateutil.parser.parse function. + Possible values 'YMD', 'MDY', 'DMY' (or lower case). + The dateutil parser recognizes many date and time formats, but may need the order of year, month and day. + If specified, use dateutil.parser.parse for date parsing. + "timestamp": (optional) Interpret the data item as a POSIX timestamp. + "timezone": (optional) Specify the time zone if the date-time string does not include a UTC offset. + The time zone must be in a dateutil-supported format, e.g. "Europe/Berlin". + By default, the local time zone is used. + +The resulting string replaces the mapped value (dictionary) in the mapping dictionary. +If date parsing is enabled, the resulting string is ISO-formatted as required by the NeXus standard. 
+ + ## Contact person in FAIRmat for this reader Sherjeel Shabih From 0117ed2818b553239cf1b372862cfdd0d7071005 Mon Sep 17 00:00:00 2001 From: matthias muntwiler Date: Tue, 20 Feb 2024 14:29:34 +0100 Subject: [PATCH 4/4] Evaluate Python expression in json_map reader --- .../dataconverter/readers/json_map/README.md | 21 +++++++++- .../dataconverter/readers/json_map/reader.py | 42 +++++++++++++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/pynxtools/dataconverter/readers/json_map/README.md b/pynxtools/dataconverter/readers/json_map/README.md index 7f91bcd3a..1bab35720 100644 --- a/pynxtools/dataconverter/readers/json_map/README.md +++ b/pynxtools/dataconverter/readers/json_map/README.md @@ -33,7 +33,7 @@ This file is designed to let you fill in the requirements of a NeXus Application The mapping files will always be based on the Template the dataconverter generates. See above on how to generate a mapping file. The right hand side values of the Template keys are what you can modify. -Here are the three different ways you can fill the right hand side of the Template keys: +Here are the different ways you can fill the right hand side of the Template keys: * Write the nested path in your datafile. This is indicated by a leading `/` before the word `entry` to make `/entry/data/current_295C` below. Example: @@ -97,6 +97,25 @@ The `datetime`, `dateutil` and `timestamp` properties are mutually exclusive. The resulting string replaces the mapped value (dictionary) in the mapping dictionary. If date parsing is enabled, the resulting string is ISO-formatted as required by the Nexus standard. +* Python expression. +The following entry creates an axis array from scalar values in the input file. 
def eval_expressions(mapping, data):
    """
    Evaluate Python expressions in mapping.

    If a mapping entry contains a dictionary with an `eval` key,
    the `eval` expression is evaluated using the Python built-in `eval`.
    The expression can use built-in functions, numpy functions in namespace `np`,
    and the variables declared by all items of the entry whose key starts with `arg`.

    Each `arg` value is resolved as follows:
    a dataset path is replaced by the dataset read from `data`;
    any other value is converted to float if possible and passed through unchanged otherwise
    (e.g. a literal, non-numeric string).

    The result of the expression replaces the value of the mapping.
    Mapping entries that are not dictionaries or have no `eval` item are left untouched.

    NOTE(security): the expression is executed with full built-ins.
    Mapping files must come from a trusted source.

    :param mapping: Mapping dictionary, modified in place
    :param data: Data dictionary
    :return: None
    """

    # Only values of existing keys are replaced, so iterating over items() is safe.
    for key, eval_args in mapping.items():
        try:
            expression = eval_args["eval"]
        except (KeyError, TypeError):
            continue

        # A single namespace (instead of separate globals and locals) keeps the
        # arguments visible inside lambdas, generator expressions and
        # comprehensions used by the expression.
        namespace = {"np": np}
        for arg, value in eval_args.items():
            if not arg.startswith("arg"):
                continue

            if is_path(value):
                value = get_val_nested_keystring_from_dict(value[1:], data)
            else:
                try:
                    value = float(value)
                except (TypeError, ValueError):
                    # Not a number (e.g. list or plain string): pass through as is.
                    pass

            namespace[arg] = value

        mapping[key] = eval(expression, namespace)