diff --git a/pynxtools/dataconverter/readers/json_map/README.md b/pynxtools/dataconverter/readers/json_map/README.md index b81aec969..1bab35720 100644 --- a/pynxtools/dataconverter/readers/json_map/README.md +++ b/pynxtools/dataconverter/readers/json_map/README.md @@ -33,7 +33,7 @@ This file is designed to let you fill in the requirements of a NeXus Application The mapping files will always be based on the Template the dataconverter generates. See above on how to generate a mapping file. The right hand side values of the Template keys are what you can modify. -Here are the three different ways you can fill the right hand side of the Template keys: +Here are the different ways you can fill the right hand side of the Template keys: * Write the nested path in your datafile. This is indicated by a leading `/` before the word `entry` to make `/entry/data/current_295C` below. Example: @@ -59,5 +59,63 @@ Note: This only works for HDF5 files currently. "/ENTRY[entry]/DATA[data]/current_300C": {"link": "current.nxs:/entry/data/current_300C"}, ``` +* Convert custom date and time string to Nexus-compliant ISO format. +The following entry parses the date and time in a string array `/logs/logs` +with items like `22/10/22 15:18:26.0164 - Starting...`. + +```json + "/ENTRY[entry]/end_time": { + "parse_string": "/logs/logs", + "index": "-1", + "regexp": "[0-9.:/]+ [0-9.:/]+", + "dateutil": "dmy", + "timezone": "Europe/Berlin" + } +``` + +The properties correspond to operations that are applied to input data, in the order given below. +The `datetime`, `dateutil` and `timestamp` properties are mutually exclusive. + + "parse_string": (required) Data path of the string (array) like for regular datasets. + "index": (optional) Element index to extract from string array. + The original data must be a string array. + If this option is not specified, the original data must be a singular string. + "regexp": (optional) Match regular expression, keeping only the matching part. 
+ If the expression contains groups, the result will be a space-delimited concatenation of the matching groups. + If the expression does not contain explicit groups, the whole match is used. + "datetime": (optional) Format string for the datetime.datetime.strptime function. + If specified, use datetime.datetime.strptime for date parsing. + "dateutil": (optional) Date ordering for the dateutil.parser.parse function. + Possible values 'YMD', 'MDY', 'DMY' (or lower case). + The dateutil parser recognizes many date and time formats, but may need the order of year, month and day. + If specified, use dateutil.parser.parse for date parsing. + "timestamp": (optional) Interpret the data item as a POSIX timestamp. + "timezone": (optional) Specify the time zone if the date-time string does not include a UTC offset. + The time zone must be in a dateutil-supported format, e.g. "Europe/Berlin". + By default, the local time zone is used. + +The resulting string replaces the mapped value (dictionary) in the mapping dictionary. +If date parsing is enabled, the resulting string is ISO-formatted as required by the NeXus standard. + +* Python expression. +The following entry creates an axis array from scalar values in the input file. + +```json + "/ENTRY[entry]/DATA[image]/angular0": { + "eval": "np.linspace(arg0[0], arg1[0], int(arg2[0]))", + "arg0": "/scan1/attrs/ScientaSliceBegin", + "arg1": "/scan1/attrs/ScientaSliceEnd", + "arg2": "/scan1/attrs/ScientaNumSlices" + }, +``` + +The properties of the mapping declare the expression and its arguments. + + "eval": (required) Python expression to be evaluated by the `eval` built-in. + The expression can use the built-in and numpy (as np) namespaces + as well as the datasets declared by the `argXxx` values. + "argXxx", where Xxx is an integer number: (optional) path of dataset to read from the input data + and to be used in the expression under the same name. 
import datetime
import re

import numpy as np


def _as_text(value):
    """Return ``value`` as ``str``, decoding ``bytes`` input as UTF-8.

    Any other type is returned unchanged.
    """
    if isinstance(value, bytes):
        return value.decode("utf-8")
    return value


def _match_regexp(pattern, value, key):
    """Match ``pattern`` at the start of ``value`` and return the matched text.

    If the pattern contains groups, the groups are joined with single spaces
    (groups that did not participate contribute an empty string); otherwise
    the whole match is returned.

    :raises ValueError: if ``pattern`` does not match ``value``.
    """
    text = _as_text(value)
    match = re.match(pattern, text)
    if match is None:
        raise ValueError(
            f"Regular expression {pattern!r} for mapping key {key!r} "
            f"does not match {text!r}"
        )
    groups = match.groups("")
    return " ".join(groups) if groups else match.group(0)


def _resolve_timezone(parse_opts):
    """Return the tzinfo selected by the "timezone" option (default: local zone).

    :raises ValueError: if the configured time zone name is unknown.
    """
    # Imported here so that this block does not depend on a module-level import.
    import dateutil.tz

    if "timezone" not in parse_opts:
        return dateutil.tz.gettz()

    zone = dateutil.tz.gettz(parse_opts["timezone"])
    if zone is None:
        # gettz() signals an unknown name with None; using it would silently
        # produce a naive (offset-less) timestamp.
        raise ValueError(f"Unknown time zone {parse_opts['timezone']!r}")
    return zone


def _to_iso_datetime(value, parse_opts):
    """Apply the first configured date option and return an ISO-formatted string.

    The options are checked in the order "datetime", "dateutil", "timestamp".
    If none of them is present, ``value`` is returned unchanged.
    The time zone is only resolved when the parsed date carries no UTC offset
    (or for POSIX timestamps, which never carry one).
    """
    if "datetime" in parse_opts:
        parsed = datetime.datetime.strptime(_as_text(value), parse_opts["datetime"])
    elif "dateutil" in parse_opts:
        import dateutil.parser

        order = parse_opts["dateutil"].lower()
        try:
            year, month, day = (order.index(letter) for letter in "ymd")
        except ValueError as err:
            raise ValueError(
                f"Invalid 'dateutil' order {parse_opts['dateutil']!r}: "
                "expected a combination of 'Y', 'M' and 'D'"
            ) from err
        parsed = dateutil.parser.parse(
            value, yearfirst=year < month, dayfirst=day < month
        )
    elif "timestamp" in parse_opts:
        stamp = datetime.datetime.fromtimestamp(
            float(value), tz=_resolve_timezone(parse_opts)
        )
        return stamp.isoformat()
    else:
        return value

    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=_resolve_timezone(parse_opts))
    return parsed.isoformat()


def parse_strings(mapping, data):
    """
    Parse strings, notably date and time, from custom format

    The function can do the following operations, in the given order, on string data.
    The result of each operation is passed on as input of the next one.

    1. Extract element from array by index.
    2. Match a regular expression.
    3. Parse date and time using the datetime or dateutil parser, or a POSIX timestamp.

    The resulting string replaces the mapped value (dictionary) in the mapping dictionary.
    If date parsing is enabled, the resulting string is ISO-formatted as required by the NeXus standard.
    The operations are selected and tuned by the following dictionary items:

    "parse_string": (required) Data path of the string (array) like for regular datasets.
        If this item is missing, string parsing is skipped altogether.
        Values that are not data paths are used literally.
    "index": (optional) Element index to extract from string array.
        The original data must be a string array.
        If this option is not specified, the original data must be a singular string.
    "regexp": (optional) Match regular expression, keeping only the matching part.
        The expression is matched at the beginning of the string.
        If the expression contains groups, the result will be a space-delimited concatenation of the matching groups.
        If the expression does not contain explicit groups, the whole match is used.
    "datetime": (optional) Format string for the datetime.datetime.strptime function.
    "dateutil": (optional) Date ordering for the dateutil.parser.parse function.
        Possible values 'YMD', 'MDY', 'DMY' (or lower case).
        The dateutil parser recognizes many date and time formats, but may need the order of year, month and day.
    "timestamp": (optional) Interpret the data item as a POSIX timestamp.
    "timezone": (optional) Specify the time zone if the date-time string does not include a UTC offset.
        The time zone must be in a dateutil-supported format, e.g. "Europe/Berlin".
        By default, the local time zone is used.

    The "datetime", "dateutil" and "timestamp" options are mutually exclusive;
    if several are given, the first one in this order is used.

    :param mapping: Mapping dictionary, modified in place
    :param data: Data dictionary
    :return: None
    :raises ValueError: if the regular expression does not match, the "dateutil"
        order is invalid, or the time zone name is unknown.
    """

    # Iterate over a snapshot of the keys because values are replaced in place.
    for key in list(mapping):
        parse_opts = mapping[key]

        try:
            value = parse_opts["parse_string"]
        except (KeyError, TypeError):
            # Not a dictionary, or a dictionary without string parsing options.
            continue

        # Only strings can be data paths; anything else is used literally.
        if isinstance(value, str) and is_path(value):
            try:
                value = get_val_nested_keystring_from_dict(value[1:], data)
            except (KeyError, TypeError):
                # Best effort, as before: leave the entry untouched
                # if the data path cannot be resolved.
                continue

        if "index" in parse_opts:
            value = value[int(parse_opts["index"])]

        if "regexp" in parse_opts:
            value = _match_regexp(parse_opts["regexp"], value, key)

        mapping[key] = _to_iso_datetime(value, parse_opts)


def eval_expressions(mapping, data):
    """
    Evaluate Python expressions in mapping.

    If a mapping entry contains a dictionary with an `eval` key,
    the `eval` expression is evaluated using the Python built-in `eval`.
    The expression can use built-in functions, numpy functions in namespace `np`,
    and argXxx variables that are defined in the mapping and can refer to dataset paths.

    An argXxx value that is a data path is replaced by the dataset it points to.
    Any other value is converted to float if possible and passed unchanged otherwise.

    The result of the expression replaces the value of the mapping.

    :param mapping: Mapping dictionary, modified in place
    :param data: Data dictionary
    :return: None
    """

    # Iterate over a snapshot of the keys because values are replaced in place.
    for key in list(mapping):
        eval_args = mapping[key]

        try:
            expression = eval_args["eval"]
        except (KeyError, TypeError):
            # Not a dictionary, or a dictionary without an expression.
            continue

        args = {}
        for name, value in eval_args.items():
            if not (isinstance(name, str) and name.startswith("arg")):
                continue

            # Only strings can be data paths; anything else is used literally.
            if isinstance(value, str) and is_path(value):
                value = get_val_nested_keystring_from_dict(value[1:], data)
            else:
                try:
                    value = float(value)
                except (TypeError, ValueError):
                    # Not a number (e.g. a list or a non-numeric string): pass as is.
                    pass

            args[name] = value

        # SECURITY: eval() executes arbitrary Python code taken from the mapping
        # file. Only use mapping files from trusted sources.
        mapping[key] = eval(expression, {"np": np}, args)