From 28ac8acbe70f094a132c5e56cf2970bb842dcea1 Mon Sep 17 00:00:00 2001 From: Max Novelli Date: Wed, 31 Jul 2024 11:50:58 +0200 Subject: [PATCH 01/25] work in progress before merging main --- resources/config.sample.json | 6 +- src/background_ingestor.py | 225 +++++++++++++++++++++++++++++++++-- src/scicat_configuration.py | 17 +-- src/scicat_ingestor.py | 20 ++-- src/scicat_kafka.py | 36 +++--- src/scicat_path_helpers.py | 36 ++++-- 6 files changed, 272 insertions(+), 68 deletions(-) diff --git a/resources/config.sample.json b/resources/config.sample.json index 80bc88d..cba65ac 100644 --- a/resources/config.sample.json +++ b/resources/config.sample.json @@ -58,17 +58,13 @@ "retrieve_instrument_from": "default", "instrument_position_in_file_path": 3, "file_handling_options": { - "hdf_structure_in_metadata": false, - "hdf_structure_to_file": true, - "hdf_structure_file_extension": ".hdf_structure.json", - "hdf_structure_output": "SOURCE_FOLDER", "local_output_directory": "data", "compute_file_stats": true, "compute_file_hash": true, "file_hash_algorithm": "blake2b", "save_file_hash": true, "hash_file_extension": "b2b", - "ingestor_files_directory": "ingestor" + "ingestor_files_directory": "../ingestor" }, "dataset_options": { "force_dataset_pid": true, diff --git a/src/background_ingestor.py b/src/background_ingestor.py index b3fd80f..4e20185 100644 --- a/src/background_ingestor.py +++ b/src/background_ingestor.py @@ -1,12 +1,17 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject) # import scippnexus as snx +import copy +import datetime +import hashlib import json import logging import pathlib from urllib.parse import urljoin +import os import h5py +import pytz import requests from scicat_configuration import ( BackgroundIngestorConfig, @@ -20,6 +25,7 @@ ) from scicat_logging import build_logger from scicat_metadata import collect_schemas, select_applicable_schema +from src.scicat_path_helpers import compose_ingestor_directory, compose_ingestor_output_file_path from system_helpers import exit_at_exceptions @@ -73,6 +79,142 @@ def extract_variables_values( return values +def _new_hash(algorithm: str) -> Any: + try: + return hashlib.new(algorithm, usedforsecurity=False) + except TypeError: + # Fallback for Python < 3.9 + return hashlib.new(algorithm) + + +def _compute_file_checksum(file_full_path: pathlib.Path, algorithm: str) -> str: + """ + Compute the checksum of a file using specified algorithm. 
+ :param file_full_path: + :param algorithm: + :return: + """ + chk = _new_hash(algorithm) + buffer = memoryview(bytearray(128 * 1024)) + with file_full_path.open("rb", buffering=0) as file: + for n in iter(lambda: file.readinto(buffer), 0): + chk.update(buffer[:n]) + return chk.hexdigest() # type: ignore[no-any-return] + + +def _create_datafiles_entry( + file_full_path: pathlib.Path, + config, + logger +): + """ + Create the matching entry in the datafiles list for the file proovided + :param file_full_path: + :param config: + :param logger: + :return: + """ + logger.info("create_datafiles_entry: adding file {}".format(file_full_path)) + + datafiles_item = { + "path": file_full_path, + "size": 0, + "time": datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"), + } + + if config.ingestion_options.compute_files_stats and file_full_path.exists(): + logger.info("create_datafiles_entry: reading file stats from disk") + stats = file_full_path.stat() + datafiles_item = { + **datafiles_item, + **{ + "size": stats.st_size, + "time": datetime.datetime.fromtimestamp(stats.st_ctime, tz=pytz.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z"), + "uid": stats.st_uid, + "gid": stats.st_gid, + "perm": stats.st_mode, + } + } + + return datafiles_item + +def _compute_file_checksum_if_needed( + file_full_path: pathlib.Path, + ingestor_directory: pathlib.Path, + config, + logger +): + checksum = "" + datafiles_item = {} + + if config.ingestion_options.compute_files_hash and os.path.exists(file_full_path): + logger.info("create_datafiles_entry: computing hash of the file from disk") + checksum = _compute_file_checksum(file_full_path, config.ingestion_options.file_hash_algorithm) + + if config.ingstion_options.save_hash_in_file: + + # file path for hash file + hash_file_full_path = compose_ingestor_output_file_path( + ingestor_directory, + file_full_path.stem, + config.ingestion_options.hash_file_extension) + logger.info("create_datafiles_entry: saving hash in file {}".format(hash_file_full_path)) + + # save hash in file + with hash_file_full_path.open('w') as fh: + fh.write(datafiles_item['chk']) + + datafiles_item = _create_datafiles_entry(hash_file_full_path,config,logger) + + return checksum, datafiles_item + + +def _create_datafiles_list( + nexus_file_path: pathlib.Path, + done_writing_message_file_path: pathlib.Path, + ingestor_directory: pathlib.Path, + config, + logger +) -> list: + """ + Update the file size and creation time according to the configuration + :param nexus_file_path: + :param done_writing_message_file_path, + :param config, + :param logger + :return: + """ + + logger.info("create_datafiles_list: adding nexus file {}".format(nexus_file_path)) + datafiles_list = [ + _create_datafiles_entry(nexus_file_path, config, logger) + ] + checksum, datafiles_hash_item = _compute_file_checksum_if_needed( + nexus_file_path, + ingestor_directory, + config, + logger) + if checksum: + datafiles_list[0]['chk'] = checksum + if datafiles_hash_item: + datafiles_list.append(datafiles_hash_item) + + if config.kafka_options.message_saving_options.message_to_file: + logger.info("create_datafiles_list: adding done writing message file {}".format(done_writing_message_file_path)) + datafiles_list.append( + _create_datafiles_entry(done_writing_message_file_path, config, logger) + ) + checksum, datafiles_hash_item = _compute_file_checksum_if_needed( + nexus_file_path, + ingestor_directory, + config, + logger) + if checksum: + datafiles_list[-1]['chk'] = checksum + if datafiles_hash_item: + 
datafiles_list.append(datafiles_hash_item) + + return datafiles_list def prepare_scicat_dataset(metadata_schema, values): """Prepare scicat dataset as dictionary ready to be ``POST``ed.""" @@ -137,12 +279,43 @@ def create_scicat_dataset(dataset: str, config: dict, logger: logging.Logger) -> return result -def prepare_scicat_origdatablock(files_list, config): ... def create_scicat_origdatablock( scicat_dataset_pid, nexus_file=None, done_writing_message_file=None ): ... +def _define_dataset_source_folder( + datafiles_list +) -> pathlib.Path: + """ + Return the dataset source folder, which is the common path between all the data files associated with the dataset + """ + return pathlib.Path( os.path.commonpath( [item["path"] for item in datafiles_list])) + + +def _path_to_relative( + datafiles_item: dict, + dataset_source_folder: pathlib.Path +) -> dict: + """ + Copy the datafiles item and transform the path to the relative path to the dataset source folder + """ + origdatablock_datafile_item = copy.deepcopy(datafiles_item) + origdatablock_datafile_item["path"] = str(datafiles_item["path"].to_relative(dataset_source_folder)) + return origdatablock_datafile_item + + +def _prepare_origdatablock_datafiles_list( + datafiles_list: list, + dataset_source_folder: pathlib.Path +) -> list: + """ + Prepare the datafiles list for the origdatablock entry in scicat + That means that the file paths needs to be relative to the dataset source folder + """ + return [_path_to_relative(item,dataset_source_folder) for item in datafiles_list] + + def main() -> None: """Main entry point of the app.""" arg_parser = build_background_ingestor_arg_parser() @@ -162,21 +335,28 @@ def main() -> None: schemas = collect_schemas(ingestion_options.schema_directory) with exit_at_exceptions(logger, daemon=False): + nexus_file_path = pathlib.Path(config.single_run_options.nexus_file) logger.info( "Nexus file to be ingested : %s", - (nexus_file_path := pathlib.Path(config.single_run_options.nexus_file)), - ) - logger.info( - "Done writing message file linked to nexus file : %s", - ( - done_writing_message_file := pathlib.Path( - config.single_run_options.done_writing_message_file - ) - ), + nexus_file_path, ) + done_writing_message_file_path = pathlib.Path() + if config.kafka_options.message_saving_options.message_to_file: + done_writing_message_file_path = pathlib.Path( + config.single_run_options.done_writing_message_file) + logger.info( + "Done writing message file linked to nexus file : %s", + done_writing_message_file_path + ) + + # log done writing message input file + logger.info(json.load(done_writing_message_file_path.open())) - # open and read done writing message input file - logger.info(json.load(done_writing_message_file.open())) + # define which is the directory where the ingestor should save the files it creates, if any is created + ingestor_directory = compose_ingestor_directory( + config.ingestion_options.file_handling_options, + nexus_file_path + ) # open nexus file with h5py with h5py.File(nexus_file_path) as h5file: @@ -207,6 +387,24 @@ def main() -> None: # Collect all data-files and hash-files descriptions _ = [json.dumps(file_dict, indent=2) for file_dict in data_file_list] + # create datafiles list + datafiles_list = _create_datafiles_list( + nexus_file_path, + done_writing_message_file_path, + ingestor_directory, + config, + logger + ) + + dataset_source_folder = _define_dataset_source_folder( + datafiles_list + ) + + origdatablock_datafiles_list = _prepare_origdatablock_datafiles_list( + datafiles_list, 
+ dataset_source_folder + ) + # create and populate scicat dataset entry scicat_dataset = prepare_scicat_dataset(metadata_schema, variables_values) @@ -217,7 +415,8 @@ def main() -> None: # create and populate scicat origdatablock entry # with files and hashes previously computed scicat_origdatablock = create_scicat_origdatablock( - scicat_dataset_pid, nexus_file_path, done_writing_message_file + scicat_dataset_pid, + origdatablock_datafiles_list ) # create origdatablock in scicat diff --git a/src/scicat_configuration.py b/src/scicat_configuration.py index 610cfef..652c1cc 100644 --- a/src/scicat_configuration.py +++ b/src/scicat_configuration.py @@ -266,17 +266,13 @@ def from_configurations(cls, config: dict) -> "kafkaOptions": @dataclass class FileHandlingOptions: - hdf_structure_in_metadata: bool = False # Not sure if needed - hdf_structure_to_file: bool = True # Not sure if needed - hdf_structure_file_extension: str = "hdf_structure.json" # Not sure if needed - hdf_structure_output: str = "SOURCE_FOLDER" # Not sure if needed local_output_directory: str = "data" compute_file_stats: bool = True compute_file_hash: bool = True file_hash_algorithm: str = "blake2b" save_file_hash: bool = True hash_file_extension: str = "b2b" - ingestor_files_directory: str = "ingestor" + ingestor_files_directory: str = "../ingestor" @dataclass @@ -311,7 +307,7 @@ def from_configurations(cls, config: dict) -> "IngestionOptions": @dataclass -class IngesterConfig: +class IngestorConfig: original_dict: Mapping """Original configuration dictionary in the json file.""" run_options: RunOptions @@ -327,7 +323,7 @@ def to_dict(self) -> dict: """Return the configuration as a dictionary.""" return asdict( - IngesterConfig( + IngestorConfig( _recursive_deepcopy( self.original_dict ), # asdict does not support MappingProxyType @@ -339,13 +335,13 @@ def to_dict(self) -> dict: ) -def build_scicat_ingester_config(input_args: argparse.Namespace) -> IngesterConfig: +def build_scicat_ingestor_config(input_args: argparse.Namespace) -> IngestorConfig: """Merge configuration from the configuration file and input arguments.""" config_dict = _load_config(input_args.config_file) run_option_dict = _merge_run_options(config_dict, vars(input_args)) # Wrap configuration in a dataclass - return IngesterConfig( + return IngestorConfig( original_dict=_freeze_dict_items(config_dict), run_options=RunOptions(**run_option_dict), kafka_options=kafkaOptions.from_configurations( @@ -365,9 +361,8 @@ class SingleRunOptions: done_writing_message_file: str """Full path of the done writing message file that match the ``nexus_file``.""" - @dataclass -class BackgroundIngestorConfig(IngesterConfig): +class BackgroundIngestorConfig(IngestorConfig): single_run_options: SingleRunOptions """Single run configuration options for background ingestor.""" diff --git a/src/scicat_ingestor.py b/src/scicat_ingestor.py index b42fdb2..ea6a7d1 100644 --- a/src/scicat_ingestor.py +++ b/src/scicat_ingestor.py @@ -16,17 +16,16 @@ from scicat_configuration import ( MessageSavingOptions, build_main_arg_parser, - build_scicat_ingester_config, + build_scicat_ingestor_config, ) from scicat_kafka import ( WritingFinished, build_consumer, - compose_message_path, save_message_to_file, wrdn_messages, ) from scicat_logging import build_logger -from scicat_path_helpers import select_target_directory +from scicat_path_helpers import compose_ingestor_output_file_path, compose_ingestor_directory from system_helpers import exit_at_exceptions @@ -57,7 +56,7 @@ def main() -> None: """Main 
entry point of the app.""" arg_parser = build_main_arg_parser() arg_namespace = arg_parser.parse_args() - config = build_scicat_ingester_config(arg_namespace) + config = build_scicat_ingestor_config(arg_namespace) logger = build_logger(config) # Log the configuration as dictionary so that it is easier to read from the logs @@ -81,17 +80,18 @@ def main() -> None: if message: # Extract nexus file path from the message. nexus_file_path = pathlib.Path(message.file_name) - file_saving_dir = select_target_directory( - config.ingestion_options.file_handling_options, nexus_file_path + ingestor_directory = compose_ingestor_directory( + config.ingestion_options.file_handling_options, + nexus_file_path ) dump_message_to_file_if_needed( logger=logger, message_saving_options=message_saving_options, message=message, - message_file_path=compose_message_path( - target_dir=file_saving_dir, - nexus_file_path=nexus_file_path, - message_saving_options=message_saving_options, + message_file_path=compose_ingestor_output_file_path( + ingestor_directory=ingestor_directory, + file_name=nexus_file_path.stem, + file_extension=message_saving_options.message_file_extension, ), ) # instantiate a new process and runs background ingestor diff --git a/src/scicat_kafka.py b/src/scicat_kafka.py index 8ed99e5..b09454f 100644 --- a/src/scicat_kafka.py +++ b/src/scicat_kafka.py @@ -136,24 +136,24 @@ def wrdn_messages( yield None -def compose_message_path( - *, - target_dir: pathlib.Path, - nexus_file_path: pathlib.Path, - message_saving_options: MessageSavingOptions, -) -> pathlib.Path: - """Compose the message path based on the nexus file path and configuration.""" - - return target_dir / ( - pathlib.Path( - ".".join( - ( - nexus_file_path.stem, - message_saving_options.message_file_extension.removeprefix("."), - ) - ) - ) - ) +# def compose_message_path( +# *, +# target_dir: pathlib.Path, +# nexus_file_path: pathlib.Path, +# message_saving_options: MessageSavingOptions, +# ) -> pathlib.Path: +# """Compose the message path based on the nexus file path and configuration.""" +# +# return target_dir / ( +# pathlib.Path( +# ".".join( +# ( +# nexus_file_path.stem, +# message_saving_options.message_file_extension.removeprefix("."), +# ) +# ) +# ) +# ) def save_message_to_file( diff --git a/src/scicat_path_helpers.py b/src/scicat_path_helpers.py index 5a164ca..3635323 100644 --- a/src/scicat_path_helpers.py +++ b/src/scicat_path_helpers.py @@ -5,20 +5,34 @@ from scicat_configuration import FileHandlingOptions -def select_target_directory( - fh_options: FileHandlingOptions, file_path: pathlib.Path +def compose_ingestor_directory( + fh_options: FileHandlingOptions, + nexus_file_path: str | pathlib.Path ) -> pathlib.Path: - """Select the target directory based on the file path and the options.""" - if fh_options.hdf_structure_output == "SOURCE_FOLDER": - return file_path.parent / pathlib.Path(fh_options.ingestor_files_directory) + """Select the ingestor directory based on the file path and the options.""" + directory = pathlib.Path(fh_options.ingestor_files_directory) + nexus_file_path = pathlib.Path(nexus_file_path) if isinstance(nexus_file_path,str) else nexus_file_path + if directory.is_absolute(): + return directory else: - return pathlib.Path(fh_options.local_output_directory) + directory = nexus_file_path.parents[0] / directory + return directory.resolve() -def compose_checksum_file_path( - fh_options: FileHandlingOptions, file_path: pathlib.Path +def compose_ingestor_output_file_path( + ingestor_directory: pathlib.Path, + 
file_name: str, + file_extension: str, ) -> pathlib.Path: - """Compose the path for the checksum file.""" - return pathlib.Path(fh_options.ingestor_files_directory) / pathlib.Path( - file_path.name + fh_options.hash_file_extension + """Compose the ingestor output file path based on the input provided.""" + + return ingestor_directory / ( + pathlib.Path( + ".".join( + ( + file_name, + file_extension, + ) + ) + ) ) From 98261994e08843e46b9415ba4f60d40305850a91 Mon Sep 17 00:00:00 2001 From: Max Novelli Date: Wed, 31 Jul 2024 17:27:13 +0200 Subject: [PATCH 02/25] WIP: fixed data structure. renamed different functions and data structure. Removed unneeded configuration --- resources/config.sample.json | 110 ++++---- src/scicat_configuration.py | 248 +++++++++--------- src/scicat_dataset.py | 11 +- ...ingestor.py => scicat_offline_ingestor.py} | 114 ++++---- ..._ingestor.py => scicat_online_ingestor.py} | 0 src/scicat_schemas/__init__.py | 2 +- .../datafilelist_item.schema.json.jinja | 9 + src/scicat_schemas/dataset.schema.json.jinja | 38 +-- src/scicat_schemas/load_template.py | 12 +- .../origdatablock.schema.json.jinja | 8 +- src/scicat_schemas/single_datafile.json.jinja | 9 - 11 files changed, 280 insertions(+), 281 deletions(-) rename src/{background_ingestor.py => scicat_offline_ingestor.py} (81%) rename src/{scicat_ingestor.py => scicat_online_ingestor.py} (100%) create mode 100644 src/scicat_schemas/datafilelist_item.schema.json.jinja delete mode 100644 src/scicat_schemas/single_datafile.json.jinja diff --git a/resources/config.sample.json b/resources/config.sample.json index cba65ac..be44f04 100644 --- a/resources/config.sample.json +++ b/resources/config.sample.json @@ -1,77 +1,71 @@ { + "config_file": "config.json", + "id": "", + "dataset": { + "check_by_job_id": true, + "use_job_id_as_dataset_id": true, + "generate_dataset_pid": true, + "dataset_pid_prefix": "20.500.12269", + "default_instrument_id": "ID_OF_FALLBACK_INSTRUMENT", + "default_instrument_name": "FALLBACK_INSTRUMENT_NAME", + "default_proposal_id": "DEFAULT_PROPOSAL_ID", + "default_ownerGroup": "DEFAULT_OWNER_GROUP", + "default_accessGroups": [ + "ACCESS_GROUP_1" + ] + }, + "ingestion": { + "dry_run": false, + "schemas_directory": "schemas", + "file_handling": { + "compute_file_stats": true, + "compute_file_hash": true, + "file_hash_algorithm": "blake2b", + "save_file_hash": true, + "hash_file_extension": "b2b", + "ingestor_files_directory": "../ingestor", + "message_to_file": true, + "message_file_extension": "message.json" + } + }, "kafka": { - "topics": ["KAFKA_TOPIC_1", "KAFKA_TOPIC_2"], + "topics": [ + "KAFKA_TOPIC_1", + "KAFKA_TOPIC_2" + ], "group_id": "GROUP_ID", - "bootstrap_servers": ["localhost:9093"], + "bootstrap_servers": [ + "localhost:9093" + ], "sasl_mechanism": "SCRAM-SHA-256", "sasl_username": "USERNAME", "sasl_password": "PASSWORD", "ssl_ca_location": "FULL_PATH_TO_CERTIFICATE_FILE", "individual_message_commit": true, "enable_auto_commit": true, - "auto_offset_reset": "earliest", - "message_saving_options": { - "message_to_file": true, - "message_file_extension": "message.json", - "message_output": "SOURCE_FOLDER" - } - }, - "user_office": { - "host": "https://useroffice.host", - "username": "USERNAME", - "password": "PASSWORD", - "token": "JWT_TOKEN" - }, - "scicat": { - "host": "https://scicat.host", - "username": "USERNAME", - "password": "PASSWORD", - "token": "JWT_TOKEN" - }, - "graylog": {"host": "", "port": "", "facility": "scicat.ingestor"}, - "dataset": { - "instrument_id": 
"ID_OF_FALLBACK_INSTRUMENT", - "instrument": "FALLBACK_INSTRUMENT_NAME", - "default_proposal_id": "DEFAULT_PROPOSAL_ID", - "ownable": { - "ownerGroup": "DEFAULT_OWNER_GROUP", - "accessGroups": ["ACCESS_GROUP_1"] - } + "auto_offset_reset": "earliest" }, - "options": { - "config_file": "config.json", + "logging": { "verbose": false, "file_log": false, "file_log_base_name": "scicat_ingestor_log", "file_log_timestamp": false, "logging_level": "INFO", + "log_message_prefix": "SFI", "system_log": false, "system_log_facility": "mail", - "log_message_prefix": "SFI", - "check_by_job_id": true, - "pyscicat": null, - "graylog": false - }, - "ingestion_options": { - "dry_run": false, - "schemas_directory": "schemas", - "retrieve_instrument_from": "default", - "instrument_position_in_file_path": 3, - "file_handling_options": { - "local_output_directory": "data", - "compute_file_stats": true, - "compute_file_hash": true, - "file_hash_algorithm": "blake2b", - "save_file_hash": true, - "hash_file_extension": "b2b", - "ingestor_files_directory": "../ingestor" - }, - "dataset_options": { - "force_dataset_pid": true, - "dataset_pid_prefix": "20.500.12269", - "use_job_id_as_dataset_id": true, - "beautify_metadata_keys": false, - "metadata_levels_separator": " " - } + "graylog": false, + "graylog_host": "", + "graylog_port": "", + "graylog_facility": "scicat.ingestor" + }, + "scicat": { + "host": "https://scicat.host", + "token": "JWT_TOKEN", + "headers": {}, + "timeout": 0, + "stream": true, + "verify": false } } + diff --git a/src/scicat_configuration.py b/src/scicat_configuration.py index 652c1cc..4a6132e 100644 --- a/src/scicat_configuration.py +++ b/src/scicat_configuration.py @@ -2,7 +2,7 @@ # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject) import argparse from collections.abc import Mapping -from dataclasses import asdict, dataclass +from dataclasses import asdict, dataclass, field from types import MappingProxyType from typing import Any @@ -20,12 +20,19 @@ def _load_config(config_file: Any) -> dict: return {} -def _merge_run_options(config_dict: dict, input_args_dict: dict) -> dict: +def _merge_config_options( + config_dict: dict, + input_args_dict: dict, + keys: list[str] | None = None +) -> dict: """Merge configuration from the configuration file and input arguments.""" + if keys == None: + keys = config_dict.keys(); + return { **config_dict.setdefault("options", {}), - **{key: value for key, value in input_args_dict.items() if value is not None}, + **{key: input_args_dict[key] for key in keys if input_args_dict[key] is not None}, } @@ -52,7 +59,7 @@ def _recursive_deepcopy(obj: Any) -> dict: return copied -def build_main_arg_parser() -> argparse.ArgumentParser: +def build_online_arg_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser() group = parser.add_argument_group("Scicat Ingestor Options") @@ -67,6 +74,13 @@ def build_main_arg_parser() -> argparse.ArgumentParser: help="Configuration file name. Default: config.20240405.json", type=str, ) + group.add_argument( + "-d", "--dry-run", + dest="dry_run", + help="Dry run. 
Does not produce any output file nor modify entry in SciCat", + action="store_true", + default=False, + ) group.add_argument( "-v", "--verbose", @@ -128,13 +142,6 @@ def build_main_arg_parser() -> argparse.ArgumentParser: action="store_true", default=True, ) - group.add_argument( - "--pyscicat", - dest="pyscicat", - help="Location where a specific version of pyscicat is available", - default=None, - type=str, - ) group.add_argument( "--graylog", dest="graylog", @@ -145,9 +152,9 @@ def build_main_arg_parser() -> argparse.ArgumentParser: return parser -def build_background_ingestor_arg_parser() -> argparse.ArgumentParser: - parser = build_main_arg_parser() - group = parser.add_argument_group('Scicat Background Ingestor Options') +def build_offline_ingestor_arg_parser() -> argparse.ArgumentParser: + parser = build_online_arg_parser() + group = parser.add_argument_group('Scicat Offline Ingestor Options') group.add_argument( '-f', @@ -178,51 +185,33 @@ def build_background_ingestor_arg_parser() -> argparse.ArgumentParser: @dataclass -class GraylogOptions: - host: str = "" - port: str = "" - facility: str = "scicat.ingestor" - - -@dataclass -class RunOptions: - """RunOptions dataclass to store the configuration options. +class LoggingOptions: + """ + LoggingOptions dataclass to store the configuration options. Most of options don't have default values because they are expected to be set by the user either in the configuration file or through command line arguments. """ - config_file: str verbose: bool file_log: bool file_log_base_name: str file_log_timestamp: bool - system_log: bool - log_message_prefix: str logging_level: str - check_by_job_id: bool + log_message_prefix: str + system_log: bool system_log_facility: str | None = None - pyscicat: str | None = None graylog: bool = False - - -@dataclass(frozen=True) -class MessageSavingOptions: - message_to_file: bool = True - """Save messages to a file.""" - message_file_extension: str = "message.json" - """Message file extension.""" - message_output: str = "SOURCE_FOLDER" - """Output directory for messages.""" - - -DEFAULT_MESSAGE_SAVING_OPTIONS = MessageSavingOptions() + graylog_host: str = "" + graylog_port: str = "" + graylog_facility: str = "scicat.ingestor" @dataclass -class kafkaOptions: - """KafkaOptions dataclass to store the configuration options. +class KafkaOptions: + """ + KafkaOptions dataclass to store the configuration options. Default values are provided as they are not expected to be set by command line arguments. 
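A minimal sketch, not part of the patch itself, of how these option dataclasses are meant to be assembled from the sample configuration above; the field names are assumed to match the keys in resources/config.sample.json and any omitted fields are assumed to fall back to their dataclass defaults:

    import json

    from scicat_configuration import KafkaOptions, SciCatOptions

    config_dict = json.loads('''
    {
      "kafka": {"topics": ["KAFKA_TOPIC_1"], "group_id": "GROUP_ID",
                "bootstrap_servers": ["localhost:9093"]},
      "scicat": {"host": "https://scicat.host", "token": "JWT_TOKEN"}
    }
    ''')

    # KafkaOptions.from_configurations simply unpacks the "kafka" section
    kafka = KafkaOptions.from_configurations(config_dict["kafka"])

    # SciCatOptions.from_configurations additionally derives the Authorization
    # header from the configured token
    scicat = SciCatOptions.from_configurations(config_dict["scicat"])
    assert scicat.headers == {"Authorization": "Bearer JWT_TOKEN"}
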
@@ -248,161 +237,170 @@ class kafkaOptions: """Enable Kafka auto commit.""" auto_offset_reset: str = "earliest" """Kafka auto offset reset.""" - message_saving_options: MessageSavingOptions = DEFAULT_MESSAGE_SAVING_OPTIONS - """Message saving options.""" @classmethod - def from_configurations(cls, config: dict) -> "kafkaOptions": + def from_configurations(cls, config: dict) -> "KafkaOptions": """Create kafkaOptions from a dictionary.""" - return cls( - **{ - **config, - "message_saving_options": MessageSavingOptions( - **config.get("message_saving_options", {}) - ), - }, - ) + return cls(**config) @dataclass class FileHandlingOptions: - local_output_directory: str = "data" - compute_file_stats: bool = True - compute_file_hash: bool = True + compute_file_stats: bool = False + compute_file_hash: bool = False file_hash_algorithm: str = "blake2b" - save_file_hash: bool = True + save_file_hash: bool = False hash_file_extension: str = "b2b" ingestor_files_directory: str = "../ingestor" - - -@dataclass -class DatasetOptions: - force_dataset_pid: bool = True # Not sure if needed - dataset_pid_prefix: str = "20.500.12269" - use_job_id_as_dataset_id: bool = True - beautify_metadata_keys: bool = False - metadata_levels_separator: str = " " - + message_to_file: bool = True + message_file_extension: str = "message.json" @dataclass class IngestionOptions: - file_handling_options: FileHandlingOptions - dataset_options: DatasetOptions - schema_directory: str = "schemas" - retrieve_instrument_from: str = "default" - instrument_position_in_file_path: int = 3 + file_handling: FileHandlingOptions + dry_run: bool = False + schemas_directory: str = "schemas" @classmethod def from_configurations(cls, config: dict) -> "IngestionOptions": """Create IngestionOptions from a dictionary.""" return cls( FileHandlingOptions(**config.get("file_handling_options", {})), - DatasetOptions(**config.get("dataset_options", {})), - schema_directory=config.get("schema_directory", "schemas"), - retrieve_instrument_from=config.get("retrieve_instrument_from", "default"), - instrument_position_in_file_path=config.get( - "instrument_position_in_file_path", 3 - ), + dry_run=config.get("dry_run", False), + schemas_directory=config.get("schemas_directory", "schemas"), ) @dataclass -class IngestorConfig: +class DatasetOptions: + check_by_job_id: bool = True, + allow_dataset_pid: bool = True, + dataset_pid_prefix: str = "20.500.12269", + default_instrument_id: str = "", + default_instrument_name: str = "", + default_proposal_id: str = "", + default_ownerGroup: str = "", + default_accessGroups: list[str] = field(default_factory=list) + + @classmethod + def from_configurations(cls, config: dict) -> "DatasetOptions": + """Create DatasetOptions from a dictionary.""" + return cls(**config) + + +@dataclass +class SciCatOptions: + host: str = "" + token: str = "" + headers: dict = field(default_factory=dict) + timeout: int = 0 + stream: bool = True + verify: bool = False + + @classmethod + def from_configurations(cls, config: dict) -> "SciCatOptions": + """Create SciCatOptions from a dictionary.""" + options = cls(**config) + options.headers = { + "Authorization": "Bearer {}".format(options.token) + } + return options + + +@dataclass +class OnlineIngestorConfig: original_dict: Mapping """Original configuration dictionary in the json file.""" - run_options: RunOptions - """Merged configuration dictionary with command line arguments.""" - kafka_options: kafkaOptions - """Kafka configuration options read from files.""" - graylog_options: 
GraylogOptions - """Graylog configuration options for streaming logs.""" - ingestion_options: IngestionOptions - """Ingestion configuration options for background ingestor.""" + dataset: DatasetOptions + kafka: KafkaOptions + logging: LoggingOptions + ingestion: IngestionOptions + scicat: SciCatOptions def to_dict(self) -> dict: """Return the configuration as a dictionary.""" return asdict( - IngestorConfig( + OnlineIngestorConfig( _recursive_deepcopy( self.original_dict ), # asdict does not support MappingProxyType - self.run_options, - self.kafka_options, - self.graylog_options, - self.ingestion_options, + self.dataset, + self.kafka, + self.logging, + self.ingestion, + self.scicat, ) ) -def build_scicat_ingestor_config(input_args: argparse.Namespace) -> IngestorConfig: +def build_scicat_online_ingestor_config(input_args: argparse.Namespace) -> OnlineIngestorConfig: """Merge configuration from the configuration file and input arguments.""" config_dict = _load_config(input_args.config_file) - run_option_dict = _merge_run_options(config_dict, vars(input_args)) + logging_dict = _merge_config_options(config_dict.setdefault("logging",{}), vars(input_args)) + ingestion_dict = _merge_config_options(config_dict.setdefault("ingestion",{}), vars(input_args), ["dry-run"]) # Wrap configuration in a dataclass - return IngestorConfig( + return OnlineIngestorConfig( original_dict=_freeze_dict_items(config_dict), - run_options=RunOptions(**run_option_dict), - kafka_options=kafkaOptions.from_configurations( - config_dict.setdefault("kafka", {}) - ), - graylog_options=GraylogOptions(**config_dict.setdefault("graylog", {})), - ingestion_options=IngestionOptions.from_configurations( - config_dict.setdefault("ingestion_options", {}) - ), + dataset=DatasetOptions(**config_dict.setdefault("dataset",{})), + ingestion=IngestionOptions.from_configurations(ingestion_dict), + kafka=KafkaOptions(**config_dict.setdefault("kafka", {})), + logging=LoggingOptions(**logging_dict), + scicat=SciCatOptions(**config_dict.setdefault("scicat", {})), ) @dataclass -class SingleRunOptions: +class OfflineRunOptions: nexus_file: str """Full path of the input nexus file to be ingested.""" done_writing_message_file: str """Full path of the done writing message file that match the ``nexus_file``.""" @dataclass -class BackgroundIngestorConfig(IngestorConfig): - single_run_options: SingleRunOptions +class OfflineIngestorConfig(OnlineIngestorConfig): + offline_run: OfflineRunOptions """Single run configuration options for background ingestor.""" def to_dict(self) -> dict: """Return the configuration as a dictionary.""" return asdict( - BackgroundIngestorConfig( + OfflineIngestorConfig( _recursive_deepcopy( self.original_dict ), # asdict does not support MappingProxyType - self.run_options, - self.kafka_options, - self.graylog_options, - self.ingestion_options, - self.single_run_options, + self.dataset, + self.kafka, + self.logging, + self.ingestion, + self.scicat, + self.offline_run, ) ) -def build_scicat_background_ingester_config( +def build_scicat_offline_ingestor_config( input_args: argparse.Namespace, -) -> BackgroundIngestorConfig: +) -> OfflineIngestorConfig: """Merge configuration from the configuration file and input arguments.""" config_dict = _load_config(input_args.config_file) input_args_dict = vars(input_args) - single_run_option_dict = { + logging_dict = _merge_config_options(config_dict.setdefault("logging",{}), input_args_dict) + ingestion_dict = _merge_config_options(config_dict.setdefault("ingestion",{}), 
input_args_dict, ["dry-run"]) + offline_run_option_dict = { "nexus_file": input_args_dict.pop("nexus_file"), "done_writing_message_file": input_args_dict.pop("done_writing_message_file"), } - run_option_dict = _merge_run_options(config_dict, input_args_dict) - ingestion_option_dict = config_dict.setdefault("ingestion_options", {}) - kafka_option_dict = config_dict.setdefault("kafka", {}) # Wrap configuration in a dataclass - return BackgroundIngestorConfig( + return OfflineIngestorConfig( original_dict=_freeze_dict_items(config_dict), - run_options=RunOptions(**run_option_dict), - kafka_options=kafkaOptions.from_configurations(kafka_option_dict), - single_run_options=SingleRunOptions(**single_run_option_dict), - graylog_options=GraylogOptions(**config_dict.setdefault("graylog", {})), - ingestion_options=IngestionOptions.from_configurations(ingestion_option_dict), + dataset=DatasetOptions(**config_dict.setdefault("dataset",{})), + ingestion=IngestionOptions.from_configurations(ingestion_dict), + kafka=KafkaOptions(**config_dict.setdefault("kafka", {})), + logging=LoggingOptions(**logging_dict), + scicat=SciCatOptions(**config_dict.setdefault("scicat", {})), + offline_run=OfflineRunOptions(**offline_run_option_dict), ) diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py index 9800b40..ad2acb1 100644 --- a/src/scicat_dataset.py +++ b/src/scicat_dataset.py @@ -9,7 +9,7 @@ from scicat_schemas import ( load_dataset_schema_template, load_origdatablock_schema_template, - load_single_datafile_template, + load_datafilelist_item_schema_template, ) @@ -164,12 +164,9 @@ def _calculate_checksum(file_path: pathlib.Path, algorithm_name: str) -> str: def build_single_data_file_desc( file_path: pathlib.Path, config: FileHandlingOptions ) -> dict[str, Any]: - """Build the description of a single data file.""" - import datetime - import json - - from scicat_schemas import load_single_datafile_template - + """ + Build the description of a single data file. 
+ """ single_file_template = load_single_datafile_template() return json.loads( diff --git a/src/background_ingestor.py b/src/scicat_offline_ingestor.py similarity index 81% rename from src/background_ingestor.py rename to src/scicat_offline_ingestor.py index 4e20185..9f1d50a 100644 --- a/src/background_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -102,31 +102,31 @@ def _compute_file_checksum(file_full_path: pathlib.Path, algorithm: str) -> str: return chk.hexdigest() # type: ignore[no-any-return] -def _create_datafiles_entry( +def _create_datafilelist_item( file_full_path: pathlib.Path, config, logger ): """ - Create the matching entry in the datafiles list for the file proovided + Create the matching entry in the datafiles list for the file provided :param file_full_path: :param config: :param logger: :return: """ - logger.info("create_datafiles_entry: adding file {}".format(file_full_path)) + logger.info("create_datafilelist_item: adding file {}".format(file_full_path)) - datafiles_item = { + datafilelist_item = { "path": file_full_path, "size": 0, "time": datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"), } if config.ingestion_options.compute_files_stats and file_full_path.exists(): - logger.info("create_datafiles_entry: reading file stats from disk") + logger.info("create_datafilelist_item: reading file stats from disk") stats = file_full_path.stat() datafiles_item = { - **datafiles_item, + **datafilelist_item, **{ "size": stats.st_size, "time": datetime.datetime.fromtimestamp(stats.st_ctime, tz=pytz.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z"), @@ -136,7 +136,7 @@ def _create_datafiles_entry( } } - return datafiles_item + return datafilelist_item def _compute_file_checksum_if_needed( file_full_path: pathlib.Path, @@ -164,7 +164,7 @@ def _compute_file_checksum_if_needed( with hash_file_full_path.open('w') as fh: fh.write(datafiles_item['chk']) - datafiles_item = _create_datafiles_entry(hash_file_full_path,config,logger) + datafiles_item = _create_datafilelist_item(hash_file_full_path,config,logger) return checksum, datafiles_item @@ -187,7 +187,7 @@ def _create_datafiles_list( logger.info("create_datafiles_list: adding nexus file {}".format(nexus_file_path)) datafiles_list = [ - _create_datafiles_entry(nexus_file_path, config, logger) + _create_datafilelist_item(nexus_file_path, config, logger) ] checksum, datafiles_hash_item = _compute_file_checksum_if_needed( nexus_file_path, @@ -202,7 +202,7 @@ def _create_datafiles_list( if config.kafka_options.message_saving_options.message_to_file: logger.info("create_datafiles_list: adding done writing message file {}".format(done_writing_message_file_path)) datafiles_list.append( - _create_datafiles_entry(done_writing_message_file_path, config, logger) + _create_datafilelist_item(done_writing_message_file_path, config, logger) ) checksum, datafiles_hash_item = _compute_file_checksum_if_needed( nexus_file_path, @@ -216,10 +216,13 @@ def _create_datafiles_list( return datafiles_list -def prepare_scicat_dataset(metadata_schema, values): - """Prepare scicat dataset as dictionary ready to be ``POST``ed.""" +def _prepare_scicat_dataset(metadata_schema, values): + """ + Prepare scicat dataset as dictionary ready to be ``POST``ed. 
+ """ schema: dict = metadata_schema["schema"] dataset = {} + scientific_metadata = { 'ingestor_metadata_schema_id': { "value": metadata_schema["id"], @@ -254,10 +257,15 @@ def prepare_scicat_dataset(metadata_schema, values): return dataset -def create_scicat_dataset(dataset: str, config: dict, logger: logging.Logger) -> dict: +def _create_scicat_dataset( + dataset: dict, + config: dict, + logger: logging.Logger +) -> dict: """ Execute a POST request to scicat to create a dataset """ + logger.info("create_scicat_dataset: Sending POST request to create new dataset") response = requests.request( method="POST", url=urljoin(config["scicat_url"], "datasets"), @@ -269,43 +277,42 @@ def create_scicat_dataset(dataset: str, config: dict, logger: logging.Logger) -> ) result = response.json() - if response.ok: - ... - else: + if not response.ok: err = result.get("error", {}) + logger.info(f"create_scicat_dataset: Failed to create new dataset. Error {err}") raise Exception(f"Error creating new dataset: {err}") - logger.info("Dataset create successfully. Dataset pid: %s", result['pid']) + logger.info("create_scicat_dataset: Dataset create successfully. Dataset pid: %s", result['pid']) return result -def create_scicat_origdatablock( +def _create_scicat_origdatablock( scicat_dataset_pid, nexus_file=None, done_writing_message_file=None ): ... def _define_dataset_source_folder( - datafiles_list + datafilelist ) -> pathlib.Path: """ Return the dataset source folder, which is the common path between all the data files associated with the dataset """ - return pathlib.Path( os.path.commonpath( [item["path"] for item in datafiles_list])) + return pathlib.Path( os.path.commonpath( [item["path"] for item in datafilelist])) def _path_to_relative( - datafiles_item: dict, + datafilelist_item: dict, dataset_source_folder: pathlib.Path ) -> dict: """ Copy the datafiles item and transform the path to the relative path to the dataset source folder """ - origdatablock_datafile_item = copy.deepcopy(datafiles_item) - origdatablock_datafile_item["path"] = str(datafiles_item["path"].to_relative(dataset_source_folder)) - return origdatablock_datafile_item + origdatablock_datafilelist_item = copy.deepcopy(datafilelist_item) + origdatablock_datafilelist_item["path"] = str(datafilelist_item["path"].to_relative(dataset_source_folder)) + return origdatablock_datafilelist_item -def _prepare_origdatablock_datafiles_list( +def _prepare_origdatablock_datafilelist( datafiles_list: list, dataset_source_folder: pathlib.Path ) -> list: @@ -368,27 +375,30 @@ def main() -> None: metadata_schema['variables'], h5file, config ) - # Collect data-file descriptions - data_file_list = [ - build_single_data_file_desc(nexus_file_path, file_handling_options), - build_single_data_file_desc( - done_writing_message_file, file_handling_options - ), - # TODO: Add nexus structure file - ] - # Create hash of all the files if needed - if file_handling_options.save_file_hash: - data_file_list += [ - save_and_build_single_hash_file_desc( - data_file_dict, file_handling_options - ) - for data_file_dict in data_file_list - ] - # Collect all data-files and hash-files descriptions - _ = [json.dumps(file_dict, indent=2) for file_dict in data_file_list] - - # create datafiles list - datafiles_list = _create_datafiles_list( + # ============================================= + # I'm not sure that using jinja templates is the right thing to do + # ============================================= + # # Collect data-file descriptions + # data_file_list = [ + # 
build_single_data_file_desc(nexus_file_path, file_handling_options), + # build_single_data_file_desc( + # done_writing_message_file, file_handling_options + # ), + # # TODO: Add nexus structure file + # ] + # # Create hash of all the files if needed + # if file_handling_options.save_file_hash: + # data_file_list += [ + # save_and_build_single_hash_file_desc( + # data_file_dict, file_handling_options + # ) + # for data_file_dict in data_file_list + # ] + # # Collect all data-files and hash-files descriptions + # _ = [json.dumps(file_dict, indent=2) for file_dict in data_file_list] + + # create datafilelist + datafilelist = _create_datafiles_list( nexus_file_path, done_writing_message_file_path, ingestor_directory, @@ -397,24 +407,24 @@ def main() -> None: ) dataset_source_folder = _define_dataset_source_folder( - datafiles_list + datafilelist ) - origdatablock_datafiles_list = _prepare_origdatablock_datafiles_list( - datafiles_list, + origdatablock_datafiles_list = _prepare_origdatablock_datafilelist( + datafilelist, dataset_source_folder ) # create and populate scicat dataset entry - scicat_dataset = prepare_scicat_dataset(metadata_schema, variables_values) + scicat_dataset = _prepare_scicat_dataset(metadata_schema, variables_values) # create dataset in scicat - scicat_dataset = create_scicat_dataset(scicat_dataset, config) + scicat_dataset = _create_scicat_dataset(scicat_dataset, config) scicat_dataset_pid = scicat_dataset["pid"] # create and populate scicat origdatablock entry # with files and hashes previously computed - scicat_origdatablock = create_scicat_origdatablock( + scicat_origdatablock = _create_scicat_origdatablock( scicat_dataset_pid, origdatablock_datafiles_list ) diff --git a/src/scicat_ingestor.py b/src/scicat_online_ingestor.py similarity index 100% rename from src/scicat_ingestor.py rename to src/scicat_online_ingestor.py diff --git a/src/scicat_schemas/__init__.py b/src/scicat_schemas/__init__.py index cf1e37f..c9c5e87 100644 --- a/src/scicat_schemas/__init__.py +++ b/src/scicat_schemas/__init__.py @@ -3,7 +3,7 @@ # ruff: noqa: F401 from .load_template import ( - load_single_datafile_template, + load_datafilelist_item_schema_template, load_dataset_schema_template, load_origdatablock_schema_template, ) diff --git a/src/scicat_schemas/datafilelist_item.schema.json.jinja b/src/scicat_schemas/datafilelist_item.schema.json.jinja new file mode 100644 index 0000000..5c0cef6 --- /dev/null +++ b/src/scicat_schemas/datafilelist_item.schema.json.jinja @@ -0,0 +1,9 @@ +{ + "path": "{{ path }}", + "size": {{ size }}, + "time": "{{ time }}", + {% if chk %}"chk": "{{ chk }}",{% endif %} + {% if uid %}"uid": "{{ uid }}",{% endif %} + {% if gid %}"gid": "{{ gid }}",{% endif %} + {% if perm %}"perm": "{{ perm }}"{% endif %} +} diff --git a/src/scicat_schemas/dataset.schema.json.jinja b/src/scicat_schemas/dataset.schema.json.jinja index bf24528..ee8da7c 100644 --- a/src/scicat_schemas/dataset.schema.json.jinja +++ b/src/scicat_schemas/dataset.schema.json.jinja @@ -1,30 +1,30 @@ { - "pid": "{{ dataset_pid }}", - "datasetName": "{{ dataset_name }}", - "description": "{{ dataset_description }}", - "principalInvestigator": "{{ principal_investigator }}", - "creationLocation": "{{ facility }}:{{ environment }}", + "pid": "{{ pid }}", + "datasetName": "{{ datasetName }}", + "description": "{{ datasetDescription }}", + "principalInvestigator": "{{ principalInvestigator }}", + "creationLocation": "{{ creationLocation }}", "scientificMetadata": { - {{ scientific_metadata }} + {{ scientificMetadata 
}} }, "owner": "{{ owner }}", - "ownerEmail": "{{ owner_email }}", - "sourceFolder": "{{ source_folder }}", - "contactEmail": "{{ contact_email }}", - "creationTime": "{{ iso_creation_time }}", + "ownerEmail": "{{ ownerEmail }}", + "sourceFolder": "{{ sourceFolder }}", + "contactEmail": "{{ contactEmail }}", + "creationTime": "{{ creationTime }}", "type": "raw", + {% if techniques }{% for technique in techniques } "techniques": [ { - "pid": "{{ technique_pid }}", - "names": "{{ technique_name }}" + "pid": "{{ technique.pid }}", + "names": "{{ technique.name }}" } - ], - "instrumentId": "{{ instrument_id }}", - "sampleId": "{{ sample_id }}", - "proposalId": "{{ proposal_id }}", - "ownerGroup": "{{ owner_group }}", + ]{% endif %}{% endfor %}, + "instrumentId": "{{ instrumentId }}", + "sampleId": "{{ sampleId }}", + "proposalId": "{{ proposalId }}", + "ownerGroup": "{{ ownerGroup }}", "accessGroups": [ - {% for access_group in access_groups %}"{{ access_group }}"{% if not loop.last %}, - {% endif %}{% endfor %} + {% for accessGroup in accessGroups %}"{{ accessGroup }}"{% if not loop.last %},{% endif %}{% endfor %} ] } diff --git a/src/scicat_schemas/load_template.py b/src/scicat_schemas/load_template.py index 070aadc..abca2ae 100644 --- a/src/scicat_schemas/load_template.py +++ b/src/scicat_schemas/load_template.py @@ -5,23 +5,23 @@ from jinja2 import Template _CUR_DIR = pathlib.Path(__file__).parent -_SINGLE_TEMPLATE_PATH = _CUR_DIR / pathlib.Path("single_datafile.json.jinja") +_DATAFILELIST_ITEM_SCHEMA_TEMPLATE_PATH = _CUR_DIR / pathlib.Path("datafilelist_item.schema.json.jinja") _DATASET_SCHEMA_TEMPLATE_PATH = _CUR_DIR / pathlib.Path("dataset.schema.json.jinja") -_ORIG_DATABLOCK_SCHEMA_TEMPLATE_PATH = _CUR_DIR / pathlib.Path( +_ORIGDATABLOCK_SCHEMA_TEMPLATE_PATH = _CUR_DIR / pathlib.Path( "origdatablock.schema.json.jinja" ) -def load_single_datafile_template() -> Template: +def load_datafilelist_item_schema_template() -> Template: """Load the template for the single datafile schema.""" - return Template((_SINGLE_TEMPLATE_PATH).read_text()) + return Template((_DATAFILELIST_ITEM_SCHEMA_TEMPLATE_PATH).read_text()) def load_dataset_schema_template() -> Template: """Load the template for the dataset schema.""" - return Template((_CUR_DIR / _DATASET_SCHEMA_TEMPLATE_PATH).read_text()) + return Template((_DATASET_SCHEMA_TEMPLATE_PATH).read_text()) def load_origdatablock_schema_template() -> Template: """Load the template for the original data block schema.""" - return Template((_CUR_DIR / _ORIG_DATABLOCK_SCHEMA_TEMPLATE_PATH).read_text()) + return Template((_ORIGDATABLOCK_SCHEMA_TEMPLATE_PATH).read_text()) diff --git a/src/scicat_schemas/origdatablock.schema.json.jinja b/src/scicat_schemas/origdatablock.schema.json.jinja index 2ab2aa3..1038506 100644 --- a/src/scicat_schemas/origdatablock.schema.json.jinja +++ b/src/scicat_schemas/origdatablock.schema.json.jinja @@ -1,9 +1,9 @@ { - "datasetId": "{{ dataset_pid }}", - "size": {{ dataset_size }}, - "chkAlg": "{{ check_algorithm }}", + "datasetId": "{{ datasetId }}", + "size": {{ size }}, + "chkAlg": "{{ chkAlg }}", "dataFileList": [ - {% for data_file_desc in data_file_desc_list %}{{ data_file_desc }}{% if not loop.last %}, + {% for dataFileList_item in dataFileList %}{{ dataFileList_item }}{% if not loop.last %}, {% endif %}{% endfor %} ] } diff --git a/src/scicat_schemas/single_datafile.json.jinja b/src/scicat_schemas/single_datafile.json.jinja deleted file mode 100644 index da87864..0000000 --- a/src/scicat_schemas/single_datafile.json.jinja +++ 
/dev/null @@ -1,9 +0,0 @@ -{ - "path": "{{ file_absolute_path }}", - "size": {{ file_size }}, - "time": "{{ datetime_isoformat }}",{% if checksum %} - "chk": "{{ checksum }}", - {% endif %}"uid": "{{ uid }}", - "gid": "{{ gid }}", - "perm": "{{ perm }}" -} From 6dd69e9eac0f4a1cf1074a32fb18ce52599b89a9 Mon Sep 17 00:00:00 2001 From: Max Novelli Date: Thu, 1 Aug 2024 13:29:34 +0200 Subject: [PATCH 03/25] finished coding offline ingestor --- resources/config.sample.json | 9 +- src/scicat_configuration.py | 6 +- src/scicat_offline_ingestor.py | 170 +++++++++++++++++++++++++-------- src/scicat_online_ingestor.py | 4 +- src/system_helpers.py | 28 ++++-- 5 files changed, 159 insertions(+), 58 deletions(-) diff --git a/resources/config.sample.json b/resources/config.sample.json index be44f04..5c5129b 100644 --- a/resources/config.sample.json +++ b/resources/config.sample.json @@ -3,14 +3,13 @@ "id": "", "dataset": { "check_by_job_id": true, - "use_job_id_as_dataset_id": true, - "generate_dataset_pid": true, + "allow_dataset_pid": true, + "generate_dataset_pid": false, "dataset_pid_prefix": "20.500.12269", "default_instrument_id": "ID_OF_FALLBACK_INSTRUMENT", - "default_instrument_name": "FALLBACK_INSTRUMENT_NAME", "default_proposal_id": "DEFAULT_PROPOSAL_ID", - "default_ownerGroup": "DEFAULT_OWNER_GROUP", - "default_accessGroups": [ + "default_owner_group": "DEFAULT_OWNER_GROUP", + "default_access_groups": [ "ACCESS_GROUP_1" ] }, diff --git a/src/scicat_configuration.py b/src/scicat_configuration.py index 4a6132e..b0c03d5 100644 --- a/src/scicat_configuration.py +++ b/src/scicat_configuration.py @@ -275,12 +275,12 @@ def from_configurations(cls, config: dict) -> "IngestionOptions": class DatasetOptions: check_by_job_id: bool = True, allow_dataset_pid: bool = True, + generate_dataset_pid: bool = False, dataset_pid_prefix: str = "20.500.12269", default_instrument_id: str = "", - default_instrument_name: str = "", default_proposal_id: str = "", - default_ownerGroup: str = "", - default_accessGroups: list[str] = field(default_factory=list) + default_owner_group: str = "", + default_access_groups: list[str] = field(default_factory=list) @classmethod def from_configurations(cls, config: dict) -> "DatasetOptions": diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index 9f1d50a..e3412cf 100644 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -7,6 +7,7 @@ import json import logging import pathlib +import uuid from urllib.parse import urljoin import os @@ -14,19 +15,17 @@ import pytz import requests from scicat_configuration import ( - BackgroundIngestorConfig, - build_background_ingestor_arg_parser, - build_scicat_background_ingester_config, + OfflineIngestorConfig, + build_offline_ingestor_arg_parser, + build_scicat_offline_ingestor_config, ) from scicat_dataset import ( - build_single_data_file_desc, convert_to_type, - save_and_build_single_hash_file_desc, ) from scicat_logging import build_logger from scicat_metadata import collect_schemas, select_applicable_schema from src.scicat_path_helpers import compose_ingestor_directory, compose_ingestor_output_file_path -from system_helpers import exit_at_exceptions +from system_helpers import offline_ingestor_exit_at_exceptions, exit def replace_variables_values(url: str, values: dict) -> str: @@ -36,7 +35,7 @@ def replace_variables_values(url: str, values: dict) -> str: def extract_variables_values( - variables: dict, h5file, config: BackgroundIngestorConfig + variables: dict, h5file, config: 
OfflineIngestorConfig ) -> dict: values = {} @@ -122,7 +121,7 @@ def _create_datafilelist_item( "time": datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"), } - if config.ingestion_options.compute_files_stats and file_full_path.exists(): + if config.ingestion.compute_files_stats and file_full_path.exists(): logger.info("create_datafilelist_item: reading file stats from disk") stats = file_full_path.stat() datafiles_item = { @@ -147,17 +146,17 @@ def _compute_file_checksum_if_needed( checksum = "" datafiles_item = {} - if config.ingestion_options.compute_files_hash and os.path.exists(file_full_path): + if config.ingestion.compute_files_hash and os.path.exists(file_full_path): logger.info("create_datafiles_entry: computing hash of the file from disk") - checksum = _compute_file_checksum(file_full_path, config.ingestion_options.file_hash_algorithm) + checksum = _compute_file_checksum(file_full_path, config.ingestion.file_hash_algorithm) - if config.ingstion_options.save_hash_in_file: + if config.ingstion.save_hash_in_file: # file path for hash file hash_file_full_path = compose_ingestor_output_file_path( ingestor_directory, file_full_path.stem, - config.ingestion_options.hash_file_extension) + config.ingestion.hash_file_extension) logger.info("create_datafiles_entry: saving hash in file {}".format(hash_file_full_path)) # save hash in file @@ -199,7 +198,7 @@ def _create_datafiles_list( if datafiles_hash_item: datafiles_list.append(datafiles_hash_item) - if config.kafka_options.message_saving_options.message_to_file: + if config.ingestion.file_handling.message_to_file: logger.info("create_datafiles_list: adding done writing message file {}".format(done_writing_message_file_path)) datafiles_list.append( _create_datafilelist_item(done_writing_message_file_path, config, logger) @@ -216,10 +215,11 @@ def _create_datafiles_list( return datafiles_list -def _prepare_scicat_dataset(metadata_schema, values): +def _prepare_scicat_dataset(metadata_schema, values, config, logger): """ Prepare scicat dataset as dictionary ready to be ``POST``ed. 
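    Values extracted from the nexus file are mapped onto the dataset fields
    defined by the metadata schema, with the remaining metadata collected under
    the nested ``scientific_metadata`` block (which also records the id of the
    ingestor metadata schema used). Configuration defaults are then enforced:
    the pid is dropped or auto-generated according to ``allow_dataset_pid`` and
    ``generate_dataset_pid``, and missing ``instrumentId``, ``proposalId``,
    ``ownerGroup`` and ``accessGroups`` fall back to the defaults from the
    dataset options.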
""" + logger.info("_prepare_scicat_dataset: Preparing scicat dataset structure") schema: dict = metadata_schema["schema"] dataset = {} @@ -254,24 +254,49 @@ def _prepare_scicat_dataset(metadata_schema, values): dataset["scientific_metadata"] = scientific_metadata + # now check that the configuration setting shave been respected + if not config.dataset.allow_dataset_pid and "pid" in dataset.keys(): + logger.info("_prepare_scicat_dataset: Pid not allowed by configuration") + del dataset["pid"] + if config.dataset.generate_dataset_pid: + logger.info("_prepare_scicat_dataset: Auto generating pid by configuration") + dataset["pid"] = str(uuid.uuid4()) + + if "instrumentId" not in dataset.keys() or not dataset["instrumentId"]: + logger.info("_prepare_scicat_dataset: Assigning default instrument id: {}".format(config.dataset.default_instrument_id)) + dataset["instrumentId"] = config.dataset.default_instrument_id + + if "proposalId" not in dataset.keys() or not dataset["proposalId"]: + logger.info("_prepare_scicat_dataset: Assigning default proposal id: {}".format(config.dataset.default_proposal_id)) + dataset["proposalId"] = config.dataset.default_proposal_id + + if "ownerGroup" not in dataset.keys() or not dataset["ownerGroup"]: + logger.info("_prepare_scicat_dataset: Assigning default ownerGroup: {}".format(config.dataset.default_owner_group)) + dataset["ownerGroup"] = config.dataset.default_owner_group + + if "accessGroups" not in dataset.keys() or not dataset["accessGroups"]: + logger.info("_prepare_scicat_dataset: Assigning default accessGroups: {}".format(json.dumps(config.dataset.default_access_groups))) + dataset["accessGroups"] = config.dataset.default_access_groups + + logger.info("_prepare_scicat_dataset: Scicat dataset: {}".format(json.dumps(dataset))) return dataset def _create_scicat_dataset( dataset: dict, - config: dict, + config, logger: logging.Logger ) -> dict: """ Execute a POST request to scicat to create a dataset """ - logger.info("create_scicat_dataset: Sending POST request to create new dataset") + logger.info("_create_scicat_dataset: Sending POST request to create new dataset") response = requests.request( method="POST", - url=urljoin(config["scicat_url"], "datasets"), + url=urljoin(config.scicat.host, "datasets"), json=dataset, - headers=config["scicat_headers"], - timeout=config["timeout_seconds"], + headers=config.scicat.headers, + timeout=config.scicat.timeout, stream=False, verify=True, ) @@ -279,16 +304,63 @@ def _create_scicat_dataset( result = response.json() if not response.ok: err = result.get("error", {}) - logger.info(f"create_scicat_dataset: Failed to create new dataset. Error {err}") + logger.info(f"_create_scicat_dataset: Failed to create new dataset. Error {err}") raise Exception(f"Error creating new dataset: {err}") - logger.info("create_scicat_dataset: Dataset create successfully. Dataset pid: %s", result['pid']) + logger.info("_create_scicat_dataset: Dataset created successfully. 
     return result
 
 
+def _prepare_scicat_origdatablock(
+    scicat_dataset,
+    datafileslist,
+    config,
+    logger
+):
+    """
+    Create local copy of the orig datablock to send to scicat
+    """
+    logger.info("_prepare_scicat_origdatablock: Preparing scicat origdatablock structure")
+    origdatablock = {
+        "ownerGroup": scicat_dataset["ownerGroup"],
+        "accessGroups": scicat_dataset["accessGroups"],
+        "size": sum([item["size"] for item in datafileslist]),
+        "chkAlg": config.ingestion.file_hash_algorithm,
+        "dataFileList": datafileslist,
+        "datasetId": scicat_dataset["pid"],
+    }
+
+    logger.info("_prepare_scicat_origdatablock: Scicat origdatablock: {}".format(json.dumps(origdatablock)))
+    return origdatablock
+
+
 def _create_scicat_origdatablock(
-    scicat_dataset_pid, nexus_file=None, done_writing_message_file=None
-): ...
+    origdatablock: dict,
+    config,
+    logger: logging.Logger
+) -> dict:
+    """
+    Execute a POST request to scicat to create a new origdatablock
+    """
+    logger.info("_create_scicat_origdatablock: Sending POST request to create new origdatablock")
+    response = requests.request(
+        method="POST",
+        url=urljoin(config.scicat.host, "origdatablocks"),
+        json=origdatablock,
+        headers=config.scicat.headers,
+        timeout=config.scicat.timeout,
+        stream=False,
+        verify=True,
+    )
+
+    result = response.json()
+    if not response.ok:
+        err = result.get("error", {})
+        logger.info(f"_create_scicat_origdatablock: Failed to create new origdatablock. Error {err}")
+        raise Exception(f"Error creating new origdatablock: {err}")
+
+    logger.info("_create_scicat_origdatablock: Origdatablock created successfully. Origdatablock pid: %s", result['_id'])
+    return result
 
 
 def _define_dataset_source_folder(
@@ -325,11 +397,11 @@ def _prepare_origdatablock_datafilelist(
 
 def main() -> None:
     """Main entry point of the app."""
-    arg_parser = build_background_ingestor_arg_parser()
+    arg_parser = build_offline_ingestor_arg_parser()
     arg_namespace = arg_parser.parse_args()
-    config = build_scicat_background_ingester_config(arg_namespace)
-    ingestion_options = config.ingestion_options
-    file_handling_options = ingestion_options.file_handling_options
+    config = build_scicat_offline_ingestor_config(arg_namespace)
+    ingestion_options = config.ingestion
+    file_handling_options = ingestion_options.file_handling
     logger = build_logger(config)
 
     # Log the configuration as dictionary so that it is easier to read from the logs
@@ -339,18 +411,18 @@ def main() -> None:
     logger.info(config.to_dict())
 
     # Collect all metadata schema configurations
-    schemas = collect_schemas(ingestion_options.schema_directory)
+    schemas = collect_schemas(ingestion_options.schemas_directory)
 
-    with exit_at_exceptions(logger, daemon=False):
-        nexus_file_path = pathlib.Path(config.single_run_options.nexus_file)
+    with offline_ingestor_exit_at_exceptions(logger):
+        nexus_file_path = pathlib.Path(config.offline_run.nexus_file)
         logger.info(
             "Nexus file to be ingested : %s",
             nexus_file_path,
         )
 
         done_writing_message_file_path = pathlib.Path()
-        if config.kafka_options.message_saving_options.message_to_file:
+        if config.ingestion.file_handling.message_to_file:
             done_writing_message_file_path = pathlib.Path(
-                config.single_run_options.done_writing_message_file)
+                config.offline_run.done_writing_message_file)
             logger.info(
                 "Done writing message file linked to nexus file : %s",
                 done_writing_message_file_path
             )
 
@@ -361,7 +433,7 @@ def main() -> None:
 
         # define which is the directory where the ingestor should save the files it creates, if any is created
         ingestor_directory = compose_ingestor_directory(
-            config.ingestion_options.file_handling_options,
+            config.ingestion.file_handling,
             nexus_file_path
         )
@@ -416,21 +488,35 @@ def main() -> None:
         )
 
         # create and populate scicat dataset entry
-        scicat_dataset = _prepare_scicat_dataset(metadata_schema, variables_values)
+        local_dataset = _prepare_scicat_dataset(
+            metadata_schema,
+            variables_values,
+            config,
+            logger
+        )
 
         # create dataset in scicat
-        scicat_dataset = _create_scicat_dataset(scicat_dataset, config)
-        scicat_dataset_pid = scicat_dataset["pid"]
+        scicat_dataset = _create_scicat_dataset(
+            local_dataset,
+            config,
+            logger
+        )
 
         # create and populate scicat origdatablock entry
         # with files and hashes previously computed
-        scicat_origdatablock = _create_scicat_origdatablock(
-            scicat_dataset_pid,
-            origdatablock_datafiles_list
+        local_origdatablock = _prepare_scicat_origdatablock(
+            scicat_dataset,
+            origdatablock_datafiles_list,
+            config,
+            logger
         )
 
         # create origdatablock in scicat
-        scicat_origdatablock_id = create_scicat_origdatablock(scicat_origdatablock)
+        scicat_origdatablock = _create_scicat_origdatablock(
+            local_origdatablock,
+            config,
+            logger
+        )
 
-        # return successful code
-        return scicat_origdatablock_id
+        # check one more time if we successfully created the entries in scicat
+        exit(logger, unexpected=not (bool(scicat_dataset) and bool(scicat_origdatablock)))
diff --git a/src/scicat_online_ingestor.py b/src/scicat_online_ingestor.py
index ea6a7d1..33fb1b9 100644
--- a/src/scicat_online_ingestor.py
+++ b/src/scicat_online_ingestor.py
@@ -26,7 +26,7 @@
 )
 from scicat_logging import build_logger
 from scicat_path_helpers import compose_ingestor_output_file_path, compose_ingestor_directory
-from system_helpers import exit_at_exceptions
+from system_helpers import online_ingestor_exit_at_exceptions
 
 
 def dump_message_to_file_if_needed(
@@ -66,7 +66,7 @@ def main() -> None:
     # Often used options
     message_saving_options = config.kafka_options.message_saving_options
 
-    with exit_at_exceptions(logger):
+    with online_ingestor_exit_at_exceptions(logger):
         # Kafka consumer
         if (consumer := build_consumer(config.kafka_options, logger)) is None:
             raise RuntimeError("Failed to build the Kafka consumer")
diff --git a/src/system_helpers.py b/src/system_helpers.py
index ed88a33..c9e1aa4 100644
--- a/src/system_helpers.py
+++ b/src/system_helpers.py
@@ -3,7 +3,7 @@
 from contextlib import contextmanager
 
 
-def quit(logger: logging.Logger, unexpected: bool = True) -> None:
+def exit(logger: logging.Logger, unexpected: bool = True) -> None:
     """Log the message and exit the program."""
     import sys
 
@@ -12,7 +12,7 @@ def quit(logger: logging.Logger, unexpected: bool = True) -> None:
 
 @contextmanager
-def exit_at_exceptions(
+def online_ingestor_exit_at_exceptions(
     logger: logging.Logger, daemon: bool = True
 ) -> Generator[None, None, None]:
     """Exit the program if an exception is raised."""
@@ -20,14 +20,30 @@ def exit_at_exceptions(
         yield
     except KeyboardInterrupt:
         logger.info("Received keyboard interrupt.")
-        quit(logger, unexpected=False)
+        exit(logger, unexpected=False)
     except Exception as e:
         logger.error("An exception occurred: %s", e)
-        quit(logger, unexpected=True)
+        exit(logger, unexpected=True)
     else:
         if daemon:
             logger.error("Loop finished unexpectedly.")
-            quit(logger, unexpected=True)
+            exit(logger, unexpected=True)
         else:
             logger.info("Finished successfully.")
-            quit(logger, unexpected=False)
+            exit(logger, unexpected=False)
+
+@contextmanager
+def offline_ingestor_exit_at_exceptions(
+    logger: logging.Logger
+) -> Generator[None, None, None]: + """ + manage exceptions specifically for offline ingestor + """ + try: + yield + except Exception as e: + logger.error("An exception occurred: %s", e) + else: + logger.error("An unexpected error occurred") + + exit(logger, unexpected=True) From 3d89d3742134ffe7d36c97c0c255237c0b6aaeac Mon Sep 17 00:00:00 2001 From: Max Novelli Date: Thu, 1 Aug 2024 14:40:07 +0200 Subject: [PATCH 04/25] fixed online ingestor --- resources/config.sample.json | 1 + src/scicat_configuration.py | 1 + src/scicat_logging.py | 35 +++++++------- src/scicat_offline_ingestor.py | 21 +++++++-- src/scicat_online_ingestor.py | 86 +++++++++++++++++++++++++--------- 5 files changed, 99 insertions(+), 45 deletions(-) diff --git a/resources/config.sample.json b/resources/config.sample.json index 5c5129b..124882e 100644 --- a/resources/config.sample.json +++ b/resources/config.sample.json @@ -15,6 +15,7 @@ }, "ingestion": { "dry_run": false, + "offline_ingestor_executable" : "./scicat_offline_ingestor.py", "schemas_directory": "schemas", "file_handling": { "compute_file_stats": true, diff --git a/src/scicat_configuration.py b/src/scicat_configuration.py index b0c03d5..a18ac63 100644 --- a/src/scicat_configuration.py +++ b/src/scicat_configuration.py @@ -260,6 +260,7 @@ class IngestionOptions: file_handling: FileHandlingOptions dry_run: bool = False schemas_directory: str = "schemas" + offline_ingestor_executable: str = "./scicat_offline_ingestor.py" @classmethod def from_configurations(cls, config: dict) -> "IngestionOptions": diff --git a/src/scicat_logging.py b/src/scicat_logging.py index e25453a..9c7e089 100644 --- a/src/scicat_logging.py +++ b/src/scicat_logging.py @@ -5,19 +5,21 @@ import logging.handlers import graypy -from scicat_configuration import IngesterConfig +from scicat_configuration import OnlineIngestorConfig, OfflineIngestorConfig -def build_logger(config: IngesterConfig) -> logging.Logger: +def build_logger( + config: OnlineIngestorConfig | OfflineIngestorConfig +) -> logging.Logger: """Build a logger and configure it according to the ``config``.""" - run_options = config.run_options + logging_options = config.logging # Build logger and formatter logger = logging.getLogger('esd extract parameters') formatter = logging.Formatter( " - ".join( ( - run_options.log_message_prefix, + logging_options.log_message_prefix, '%(asctime)s', '%(name)s', '%(levelname)s', @@ -27,9 +29,9 @@ def build_logger(config: IngesterConfig) -> logging.Logger: ) # Add FileHandler - if run_options.file_log: - file_name_components = [run_options.file_log_base_name] - if run_options.file_log_timestamp: + if logging_options.file_log: + file_name_components = [logging_options.file_log_base_name] + if logging_options.file_log_timestamp: file_name_components.append( datetime.datetime.now(datetime.UTC).strftime('%Y%m%d%H%M%S%f') ) @@ -40,30 +42,29 @@ def build_logger(config: IngesterConfig) -> logging.Logger: logger.addHandler(file_handler) # Add SysLogHandler - if run_options.system_log: + if logging_options.system_log: logger.addHandler(logging.handlers.SysLogHandler(address='/dev/log')) # Add graylog handler - if run_options.graylog: - graylog_config = config.graylog_options + if logging_options.graylog: graylog_handler = graypy.GELFTCPHandler( - graylog_config.host, - int(graylog_config.port), - facility=graylog_config.facility, + logging_options.graylog_host, + int(logging_options.graylog_port), + facility=logging_options.graylog_facility, ) logger.addHandler(graylog_handler) # Set the level and 
formatter for all handlers - logger.setLevel(run_options.logging_level) + logger.setLevel(logging_options.logging_level) for handler in logger.handlers: - handler.setLevel(run_options.logging_level) + handler.setLevel(logging_options.logging_level) handler.setFormatter(formatter) # Add StreamHandler # streamer handler is added last since it is using different formatter - if run_options.verbose: + if logging_options.verbose: from rich.logging import RichHandler - logger.addHandler(RichHandler(level=run_options.logging_level)) + logger.addHandler(RichHandler(level=logging_options.logging_level)) return logger diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index e3412cf..1e8cf85 100644 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -215,13 +215,19 @@ def _create_datafiles_list( return datafiles_list -def _prepare_scicat_dataset(metadata_schema, values, config, logger): +def _prepare_scicat_dataset( + metadata_schema: dict, + values: dict, + datafilelist: list[dict], + config, + logger +): """ Prepare scicat dataset as dictionary ready to be ``POST``ed. """ logger.info("_prepare_scicat_dataset: Preparing scicat dataset structure") schema: dict = metadata_schema["schema"] - dataset = {} + dataset: dict = {} scientific_metadata = { 'ingestor_metadata_schema_id': { @@ -278,6 +284,10 @@ def _prepare_scicat_dataset(metadata_schema, values, config, logger): logger.info("_prepare_scicat_dataset: Assigning default accessGroups: {}".format(json.dumps(config.dataset.default_access_groups))) dataset["accessGroups"] = config.dataset.default_access_groups + dataset["size"] = len(datafilelist) + dataset["numberOfFiles"] = sum([item["size"] for item in datafilelist]) + dataset["isPublished"] = False + logger.info("_prepare_scicat_dataset: Scicat dataset: {}".format(json.dumps(dataset))) return dataset @@ -313,7 +323,7 @@ def _create_scicat_dataset( def _prepare_scicat_origdatablock( scicat_dataset, - datafileslist, + datafilelist, config, logger ): @@ -324,9 +334,9 @@ def _prepare_scicat_origdatablock( origdatablock = { "ownerGroup": scicat_dataset["ownerGroup"], "accessGroups": scicat_dataset["accessGroups"], - "size": sum([item["size"] for item in datafileslist]), + "size": sum([item["size"] for item in datafilelist]), "chkAlg": config.ingestion.file_hash_algorithm, - "dataFileList": datafileslist, + "dataFileList": datafilelist, "datasetId": scicat_dataset["pid"], } @@ -491,6 +501,7 @@ def main() -> None: local_dataset = _prepare_scicat_dataset( metadata_schema, variables_values, + datafilelist, config, logger ) diff --git a/src/scicat_online_ingestor.py b/src/scicat_online_ingestor.py index 33fb1b9..72f3fbd 100644 --- a/src/scicat_online_ingestor.py +++ b/src/scicat_online_ingestor.py @@ -5,6 +5,7 @@ import importlib.metadata import logging import pathlib +import subprocess try: __version__ = importlib.metadata.version(__package__ or __name__) @@ -14,9 +15,9 @@ del importlib from scicat_configuration import ( - MessageSavingOptions, - build_main_arg_parser, - build_scicat_ingestor_config, + build_online_arg_parser, + build_scicat_online_ingestor_config, + FileHandlingOptions, ) from scicat_kafka import ( WritingFinished, @@ -33,11 +34,11 @@ def dump_message_to_file_if_needed( *, logger: logging.Logger, message_file_path: pathlib.Path, - message_saving_options: MessageSavingOptions, + file_handling_options: FileHandlingOptions, message: WritingFinished, ) -> None: """Dump the message to a file according to the configuration.""" - if not 
message_saving_options.message_to_file: + if not file_handling_options.message_to_file: logger.info("Message saving to file is disabled. Skipping saving message.") return elif not message_file_path.parent.exists(): @@ -52,25 +53,39 @@ def dump_message_to_file_if_needed( logger.info("Message file saved") +def _individual_message_commit(offline_ingestors, consumer, logger): + logger.info("{} offline ingestors running".format(len(offline_ingestors))) + for job_id, job_item in offline_ingestors.items(): + result = job_item["proc"].poll() + if result is not None: + logger.info("Offline ingestor for job id {} ended with result {}".format(job_id,result)) + if result == 0: + logger.info("Executing commit for message with job id {}".format(job_id)) + consumer.commit(message=job_item["message"]) + logger.info("Removed ingestor for message with job id {} from queue".format(job_id)) + offline_ingestors.pop(job_id) + + def main() -> None: """Main entry point of the app.""" - arg_parser = build_main_arg_parser() + arg_parser = build_online_arg_parser() arg_namespace = arg_parser.parse_args() - config = build_scicat_ingestor_config(arg_namespace) + config = build_scicat_online_ingestor_config(arg_namespace) logger = build_logger(config) # Log the configuration as dictionary so that it is easier to read from the logs logger.info('Starting the Scicat online Ingestor with the following configuration:') logger.info(config.to_dict()) - # Often used options - message_saving_options = config.kafka_options.message_saving_options - with online_ingestor_exit_at_exceptions(logger): # Kafka consumer - if (consumer := build_consumer(config.kafka_options, logger)) is None: + if (consumer := build_consumer(config.kafka, logger)) is None: raise RuntimeError("Failed to build the Kafka consumer") + # this is the dictionary that contains the list of offline ingestor running + offline_ingestors: dict = {} + + # Receive messages for message in wrdn_messages(consumer, logger): logger.info("Processing message: %s", message) @@ -78,22 +93,26 @@ def main() -> None: # Check if we have received a WRDN message. # ``message: None | WritingFinished`` if message: + # extract job id + job_id = message.job_id # Extract nexus file path from the message. 
nexus_file_path = pathlib.Path(message.file_name) ingestor_directory = compose_ingestor_directory( - config.ingestion_options.file_handling_options, + config.ingestion.file_handling, nexus_file_path ) + done_writing_message_file_path = compose_ingestor_output_file_path( + ingestor_directory=ingestor_directory, + file_name=nexus_file_path.stem, + file_extension=config.ingestion.file_handling.message_file_extension, + ) dump_message_to_file_if_needed( logger=logger, - message_saving_options=message_saving_options, + file_handling_options=config.ingestion.file_handling, message=message, - message_file_path=compose_ingestor_output_file_path( - ingestor_directory=ingestor_directory, - file_name=nexus_file_path.stem, - file_extension=message_saving_options.message_file_extension, - ), + message_file_path=done_writing_message_file_path, ) + # instantiate a new process and runs background ingestor # on the nexus file # use open process and wait for outcome @@ -105,11 +124,32 @@ def main() -> None: -m message_file_path # optional depending on the # message_saving_options.message_output """ + cmd = [ + config.ingestion.offline_ingestor_executable, + "-c", + arg_namespace.config_file, + "-f", + nexus_file_path, + "-j", + job_id + ] + if config.ingestion.file_handling.message_to_file: + cmd += [ + "-m", + done_writing_message_file_path + ] + proc = subprocess.Popen(cmd) + # save info about the background process + offline_ingestors[job_id] = { + "proc" : proc, + "message": message, + } # if background process is successful # check if we need to commit the individual message - """ - if config.kafka_options.individual_message_commit \ - and background_process is successful: - consumer.commit(message=message) - """ + if config.kafka.individual_message_commit: + _individual_message_commit( + offline_ingestors, + consumer, + logger) + From e4e285571f4f0ecb7d2d385c125b7364dbacdd48 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Mon, 5 Aug 2024 21:40:47 +0200 Subject: [PATCH 05/25] Make ruff happy --- src/scicat_online_ingestor.py | 42 +++++++++++++++++------------------ 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/src/scicat_online_ingestor.py b/src/scicat_online_ingestor.py index 72f3fbd..eb04123 100644 --- a/src/scicat_online_ingestor.py +++ b/src/scicat_online_ingestor.py @@ -15,9 +15,9 @@ del importlib from scicat_configuration import ( + FileHandlingOptions, build_online_arg_parser, build_scicat_online_ingestor_config, - FileHandlingOptions, ) from scicat_kafka import ( WritingFinished, @@ -26,7 +26,10 @@ wrdn_messages, ) from scicat_logging import build_logger -from scicat_path_helpers import compose_ingestor_output_file_path, compose_ingestor_directory +from scicat_path_helpers import ( + compose_ingestor_directory, + compose_ingestor_output_file_path, +) from system_helpers import online_ingestor_exit_at_exceptions @@ -53,16 +56,20 @@ def dump_message_to_file_if_needed( logger.info("Message file saved") -def _individual_message_commit(offline_ingestors, consumer, logger): - logger.info("{} offline ingestors running".format(len(offline_ingestors))) +def _individual_message_commit(offline_ingestors, consumer, logger: logging.Logger): + logger.info("%s offline ingestors running", len(offline_ingestors)) for job_id, job_item in offline_ingestors.items(): result = job_item["proc"].poll() if result is not None: - logger.info("Offline ingestor for job id {} ended with result {}".format(job_id,result)) + logger.info( + "Offline ingestor for job id %s ended with result %s", job_id, result 
+ ) if result == 0: - logger.info("Executing commit for message with job id {}".format(job_id)) + logger.info("Executing commit for message with job id %s", job_id) consumer.commit(message=job_item["message"]) - logger.info("Removed ingestor for message with job id {} from queue".format(job_id)) + logger.info( + "Removed ingestor for message with job id %s from queue", job_id + ) offline_ingestors.pop(job_id) @@ -85,7 +92,6 @@ def main() -> None: # this is the dictionary that contains the list of offline ingestor running offline_ingestors: dict = {} - # Receive messages for message in wrdn_messages(consumer, logger): logger.info("Processing message: %s", message) @@ -98,8 +104,7 @@ def main() -> None: # Extract nexus file path from the message. nexus_file_path = pathlib.Path(message.file_name) ingestor_directory = compose_ingestor_directory( - config.ingestion.file_handling, - nexus_file_path + config.ingestion.file_handling, nexus_file_path ) done_writing_message_file_path = compose_ingestor_output_file_path( ingestor_directory=ingestor_directory, @@ -131,25 +136,18 @@ def main() -> None: "-f", nexus_file_path, "-j", - job_id + job_id, ] if config.ingestion.file_handling.message_to_file: - cmd += [ - "-m", - done_writing_message_file_path - ] - proc = subprocess.Popen(cmd) + cmd += ["-m", done_writing_message_file_path] + proc = subprocess.Popen(cmd) # noqa: S603 # save info about the background process offline_ingestors[job_id] = { - "proc" : proc, + "proc": proc, "message": message, } # if background process is successful # check if we need to commit the individual message if config.kafka.individual_message_commit: - _individual_message_commit( - offline_ingestors, - consumer, - logger) - + _individual_message_commit(offline_ingestors, consumer, logger) From c80e759d55e98b8b24a4b1a1d20e47e70320dff1 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Mon, 5 Aug 2024 21:53:07 +0200 Subject: [PATCH 06/25] Make ruff happy. 
--- resources/base.imsc.json.example | 2 +- src/scicat_configuration.py | 61 +++++--- src/scicat_dataset.py | 7 +- src/scicat_kafka.py | 8 +- src/scicat_logging.py | 4 +- src/scicat_offline_ingestor.py | 235 +++++++++++++++------------- src/scicat_path_helpers.py | 11 +- src/scicat_schemas/load_template.py | 4 +- src/system_helpers.py | 3 +- tests/test_scicat_schema.py | 10 +- 10 files changed, 188 insertions(+), 157 deletions(-) diff --git a/resources/base.imsc.json.example b/resources/base.imsc.json.example index 3d3d00e..d6b443d 100644 --- a/resources/base.imsc.json.example +++ b/resources/base.imsc.json.example @@ -3,7 +3,7 @@ "name" : "Generic metadata schema" "instrument" : "", "selector" : "filename:starts_with:/ess/data", - "variables" : { + "variables" : { "pid": { "source": "NXS", "path": "/entry/entry_identifier_uuid", diff --git a/src/scicat_configuration.py b/src/scicat_configuration.py index a18ac63..6f8557e 100644 --- a/src/scicat_configuration.py +++ b/src/scicat_configuration.py @@ -21,18 +21,20 @@ def _load_config(config_file: Any) -> dict: def _merge_config_options( - config_dict: dict, - input_args_dict: dict, - keys: list[str] | None = None + config_dict: dict, input_args_dict: dict, keys: list[str] | None = None ) -> dict: """Merge configuration from the configuration file and input arguments.""" - if keys == None: - keys = config_dict.keys(); + if keys is None: + keys = config_dict.keys() return { **config_dict.setdefault("options", {}), - **{key: input_args_dict[key] for key in keys if input_args_dict[key] is not None}, + **{ + key: input_args_dict[key] + for key in keys + if input_args_dict[key] is not None + }, } @@ -75,7 +77,8 @@ def build_online_arg_parser() -> argparse.ArgumentParser: type=str, ) group.add_argument( - "-d", "--dry-run", + "-d", + "--dry-run", dest="dry_run", help="Dry run. 
Does not produce any output file nor modify entry in SciCat", action="store_true", @@ -255,6 +258,7 @@ class FileHandlingOptions: message_to_file: bool = True message_file_extension: str = "message.json" + @dataclass class IngestionOptions: file_handling: FileHandlingOptions @@ -274,13 +278,13 @@ def from_configurations(cls, config: dict) -> "IngestionOptions": @dataclass class DatasetOptions: - check_by_job_id: bool = True, - allow_dataset_pid: bool = True, - generate_dataset_pid: bool = False, - dataset_pid_prefix: str = "20.500.12269", - default_instrument_id: str = "", - default_proposal_id: str = "", - default_owner_group: str = "", + check_by_job_id: bool = (True,) + allow_dataset_pid: bool = (True,) + generate_dataset_pid: bool = (False,) + dataset_pid_prefix: str = ("20.500.12269",) + default_instrument_id: str = ("",) + default_proposal_id: str = ("",) + default_owner_group: str = ("",) default_access_groups: list[str] = field(default_factory=list) @classmethod @@ -302,9 +306,7 @@ class SciCatOptions: def from_configurations(cls, config: dict) -> "SciCatOptions": """Create SciCatOptions from a dictionary.""" options = cls(**config) - options.headers = { - "Authorization": "Bearer {}".format(options.token) - } + options.headers = {"Authorization": f"Bearer {options.token}"} return options @@ -335,16 +337,22 @@ def to_dict(self) -> dict: ) -def build_scicat_online_ingestor_config(input_args: argparse.Namespace) -> OnlineIngestorConfig: +def build_scicat_online_ingestor_config( + input_args: argparse.Namespace, +) -> OnlineIngestorConfig: """Merge configuration from the configuration file and input arguments.""" config_dict = _load_config(input_args.config_file) - logging_dict = _merge_config_options(config_dict.setdefault("logging",{}), vars(input_args)) - ingestion_dict = _merge_config_options(config_dict.setdefault("ingestion",{}), vars(input_args), ["dry-run"]) + logging_dict = _merge_config_options( + config_dict.setdefault("logging", {}), vars(input_args) + ) + ingestion_dict = _merge_config_options( + config_dict.setdefault("ingestion", {}), vars(input_args), ["dry-run"] + ) # Wrap configuration in a dataclass return OnlineIngestorConfig( original_dict=_freeze_dict_items(config_dict), - dataset=DatasetOptions(**config_dict.setdefault("dataset",{})), + dataset=DatasetOptions(**config_dict.setdefault("dataset", {})), ingestion=IngestionOptions.from_configurations(ingestion_dict), kafka=KafkaOptions(**config_dict.setdefault("kafka", {})), logging=LoggingOptions(**logging_dict), @@ -359,6 +367,7 @@ class OfflineRunOptions: done_writing_message_file: str """Full path of the done writing message file that match the ``nexus_file``.""" + @dataclass class OfflineIngestorConfig(OnlineIngestorConfig): offline_run: OfflineRunOptions @@ -388,8 +397,12 @@ def build_scicat_offline_ingestor_config( """Merge configuration from the configuration file and input arguments.""" config_dict = _load_config(input_args.config_file) input_args_dict = vars(input_args) - logging_dict = _merge_config_options(config_dict.setdefault("logging",{}), input_args_dict) - ingestion_dict = _merge_config_options(config_dict.setdefault("ingestion",{}), input_args_dict, ["dry-run"]) + logging_dict = _merge_config_options( + config_dict.setdefault("logging", {}), input_args_dict + ) + ingestion_dict = _merge_config_options( + config_dict.setdefault("ingestion", {}), input_args_dict, ["dry-run"] + ) offline_run_option_dict = { "nexus_file": input_args_dict.pop("nexus_file"), "done_writing_message_file": 
input_args_dict.pop("done_writing_message_file"), @@ -398,7 +411,7 @@ def build_scicat_offline_ingestor_config( # Wrap configuration in a dataclass return OfflineIngestorConfig( original_dict=_freeze_dict_items(config_dict), - dataset=DatasetOptions(**config_dict.setdefault("dataset",{})), + dataset=DatasetOptions(**config_dict.setdefault("dataset", {})), ingestion=IngestionOptions.from_configurations(ingestion_dict), kafka=KafkaOptions(**config_dict.setdefault("kafka", {})), logging=LoggingOptions(**logging_dict), diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py index ad2acb1..8c67d14 100644 --- a/src/scicat_dataset.py +++ b/src/scicat_dataset.py @@ -1,15 +1,16 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject) import datetime +import json import pathlib from types import MappingProxyType from typing import Any from scicat_configuration import FileHandlingOptions from scicat_schemas import ( + load_datafilelist_item_schema_template, load_dataset_schema_template, load_origdatablock_schema_template, - load_datafilelist_item_schema_template, ) @@ -114,7 +115,7 @@ def build_single_datafile_instance( perm: str, checksum: str = "", ) -> str: - return load_single_datafile_template().render( + return load_datafilelist_item_schema_template().render( file_absolute_path=file_absolute_path, file_size=file_size, datetime_isoformat=datetime_isoformat, @@ -167,7 +168,7 @@ def build_single_data_file_desc( """ Build the description of a single data file. """ - single_file_template = load_single_datafile_template() + single_file_template = load_datafilelist_item_schema_template() return json.loads( single_file_template.render( diff --git a/src/scicat_kafka.py b/src/scicat_kafka.py index b09454f..8c47a8a 100644 --- a/src/scicat_kafka.py +++ b/src/scicat_kafka.py @@ -5,7 +5,7 @@ from collections.abc import Generator from confluent_kafka import Consumer -from scicat_configuration import MessageSavingOptions, kafkaOptions +from scicat_configuration import KafkaOptions from streaming_data_types import deserialise_wrdn from streaming_data_types.finished_writing_wrdn import ( FILE_IDENTIFIER as WRDN_FILE_IDENTIFIER, @@ -13,7 +13,7 @@ from streaming_data_types.finished_writing_wrdn import WritingFinished -def collect_consumer_options(options: kafkaOptions) -> dict: +def collect_consumer_options(options: KafkaOptions) -> dict: """Build a Kafka consumer and configure it according to the ``options``.""" from dataclasses import asdict @@ -35,7 +35,7 @@ def collect_consumer_options(options: kafkaOptions) -> dict: return config_dict -def collect_kafka_topics(options: kafkaOptions) -> list[str]: +def collect_kafka_topics(options: KafkaOptions) -> list[str]: """Return the Kafka topics as a list.""" if isinstance(options.topics, str): return options.topics.split(",") @@ -45,7 +45,7 @@ def collect_kafka_topics(options: kafkaOptions) -> list[str]: raise TypeError("The topics must be a list or a comma-separated string.") -def build_consumer(kafka_options: kafkaOptions, logger: logging.Logger) -> Consumer: +def build_consumer(kafka_options: KafkaOptions, logger: logging.Logger) -> Consumer: """Build a Kafka consumer and configure it according to the ``options``.""" consumer_options = collect_consumer_options(kafka_options) logger.info("Connecting to Kafka with the following parameters:") diff --git a/src/scicat_logging.py b/src/scicat_logging.py index 9c7e089..d9ff480 100644 --- a/src/scicat_logging.py +++ b/src/scicat_logging.py @@ -5,11 
+5,11 @@ import logging.handlers import graypy -from scicat_configuration import OnlineIngestorConfig, OfflineIngestorConfig +from scicat_configuration import OfflineIngestorConfig, OnlineIngestorConfig def build_logger( - config: OnlineIngestorConfig | OfflineIngestorConfig + config: OnlineIngestorConfig | OfflineIngestorConfig, ) -> logging.Logger: """Build a logger and configure it according to the ``config``.""" logging_options = config.logging diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index 1e8cf85..ccf0d81 100644 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -6,10 +6,11 @@ import hashlib import json import logging +import os import pathlib import uuid +from typing import Any from urllib.parse import urljoin -import os import h5py import pytz @@ -24,8 +25,12 @@ ) from scicat_logging import build_logger from scicat_metadata import collect_schemas, select_applicable_schema -from src.scicat_path_helpers import compose_ingestor_directory, compose_ingestor_output_file_path -from system_helpers import offline_ingestor_exit_at_exceptions, exit +from system_helpers import exit, offline_ingestor_exit_at_exceptions + +from src.scicat_path_helpers import ( + compose_ingestor_directory, + compose_ingestor_output_file_path, +) def replace_variables_values(url: str, values: dict) -> str: @@ -78,6 +83,7 @@ def extract_variables_values( return values + def _new_hash(algorithm: str) -> Any: try: return hashlib.new(algorithm, usedforsecurity=False) @@ -101,11 +107,7 @@ def _compute_file_checksum(file_full_path: pathlib.Path, algorithm: str) -> str: return chk.hexdigest() # type: ignore[no-any-return] -def _create_datafilelist_item( - file_full_path: pathlib.Path, - config, - logger -): +def _create_datafilelist_item(file_full_path: pathlib.Path, config, logger): """ Create the matching entry in the datafiles list for the file provided :param file_full_path: @@ -113,67 +115,75 @@ def _create_datafilelist_item( :param logger: :return: """ - logger.info("create_datafilelist_item: adding file {}".format(file_full_path)) + logger.info("create_datafilelist_item: adding file %s", file_full_path.absolute()) datafilelist_item = { "path": file_full_path, "size": 0, - "time": datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.000Z"), + "time": datetime.datetime.now(tz=datetime.UTC).strftime( + "%Y-%m-%dT%H:%M:%S.000Z" + ), } if config.ingestion.compute_files_stats and file_full_path.exists(): logger.info("create_datafilelist_item: reading file stats from disk") stats = file_full_path.stat() - datafiles_item = { + datafilelist_item = { **datafilelist_item, **{ "size": stats.st_size, - "time": datetime.datetime.fromtimestamp(stats.st_ctime, tz=pytz.utc).strftime("%Y-%m-%dT%H:%M:%S.000Z"), + "time": datetime.datetime.fromtimestamp( + stats.st_ctime, tz=pytz.utc + ).strftime("%Y-%m-%dT%H:%M:%S.000Z"), "uid": stats.st_uid, "gid": stats.st_gid, "perm": stats.st_mode, - } + }, } return datafilelist_item + def _compute_file_checksum_if_needed( - file_full_path: pathlib.Path, - ingestor_directory: pathlib.Path, - config, - logger + file_full_path: pathlib.Path, ingestor_directory: pathlib.Path, config, logger ): checksum = "" datafiles_item = {} if config.ingestion.compute_files_hash and os.path.exists(file_full_path): logger.info("create_datafiles_entry: computing hash of the file from disk") - checksum = _compute_file_checksum(file_full_path, config.ingestion.file_hash_algorithm) + checksum = _compute_file_checksum( + file_full_path, 
config.ingestion.file_hash_algorithm + ) if config.ingstion.save_hash_in_file: - # file path for hash file hash_file_full_path = compose_ingestor_output_file_path( ingestor_directory, file_full_path.stem, - config.ingestion.hash_file_extension) - logger.info("create_datafiles_entry: saving hash in file {}".format(hash_file_full_path)) + config.ingestion.hash_file_extension, + ) + logger.info( + "create_datafiles_entry: saving hash in file %s", hash_file_full_path + ) # save hash in file with hash_file_full_path.open('w') as fh: fh.write(datafiles_item['chk']) - datafiles_item = _create_datafilelist_item(hash_file_full_path,config,logger) + datafiles_item = _create_datafilelist_item( + hash_file_full_path, config, logger + ) return checksum, datafiles_item def _create_datafiles_list( - nexus_file_path: pathlib.Path, - done_writing_message_file_path: pathlib.Path, - ingestor_directory: pathlib.Path, - config, - logger + nexus_file_path: pathlib.Path, + done_writing_message_file_path: pathlib.Path, + ingestor_directory: pathlib.Path, + config, + logger, ) -> list: """ Update the file size and creation time according to the configuration @@ -184,30 +194,29 @@ def _create_datafiles_list( :return: """ - logger.info("create_datafiles_list: adding nexus file {}".format(nexus_file_path)) - datafiles_list = [ - _create_datafilelist_item(nexus_file_path, config, logger) - ] + logger.info( + "create_datafiles_list: adding nexus file %s", nexus_file_path.absolute() + ) + datafiles_list = [_create_datafilelist_item(nexus_file_path, config, logger)] checksum, datafiles_hash_item = _compute_file_checksum_if_needed( - nexus_file_path, - ingestor_directory, - config, - logger) + nexus_file_path, ingestor_directory, config, logger + ) if checksum: datafiles_list[0]['chk'] = checksum if datafiles_hash_item: datafiles_list.append(datafiles_hash_item) if config.ingestion.file_handling.message_to_file: - logger.info("create_datafiles_list: adding done writing message file {}".format(done_writing_message_file_path)) + logger.info( + "create_datafiles_list: adding done writing message file %s", + done_writing_message_file_path.absolute(), + ) datafiles_list.append( _create_datafilelist_item(done_writing_message_file_path, config, logger) ) checksum, datafiles_hash_item = _compute_file_checksum_if_needed( - nexus_file_path, - ingestor_directory, - config, - logger) + nexus_file_path, ingestor_directory, config, logger + ) if checksum: datafiles_list[-1]['chk'] = checksum if datafiles_hash_item: @@ -215,12 +224,9 @@ def _create_datafiles_list( return datafiles_list + def _prepare_scicat_dataset( - metadata_schema: dict, - values: dict, - datafilelist: list[dict], - config, - logger + metadata_schema: dict, values: dict, datafilelist: list[dict], config, logger ): """ Prepare scicat dataset as dictionary ready to be ``POST``ed. 
@@ -269,34 +275,42 @@ def _prepare_scicat_dataset( dataset["pid"] = str(uuid.uuid4()) if "instrumentId" not in dataset.keys() or not dataset["instrumentId"]: - logger.info("_prepare_scicat_dataset: Assigning default instrument id: {}".format(config.dataset.default_instrument_id)) + logger.info( + "_prepare_scicat_dataset: Assigning default instrument id: %s", + config.dataset.default_instrument_id, + ) dataset["instrumentId"] = config.dataset.default_instrument_id if "proposalId" not in dataset.keys() or not dataset["proposalId"]: - logger.info("_prepare_scicat_dataset: Assigning default proposal id: {}".format(config.dataset.default_proposal_id)) + logger.info( + "_prepare_scicat_dataset: Assigning default proposal id: %s", + config.dataset.default_proposal_id, + ) dataset["proposalId"] = config.dataset.default_proposal_id if "ownerGroup" not in dataset.keys() or not dataset["ownerGroup"]: - logger.info("_prepare_scicat_dataset: Assigning default ownerGroup: {}".format(config.dataset.default_owner_group)) + logger.info( + "_prepare_scicat_dataset: Assigning default ownerGroup: %s", + config.dataset.default_owner_group, + ) dataset["ownerGroup"] = config.dataset.default_owner_group if "accessGroups" not in dataset.keys() or not dataset["accessGroups"]: - logger.info("_prepare_scicat_dataset: Assigning default accessGroups: {}".format(json.dumps(config.dataset.default_access_groups))) + logger.info( + "_prepare_scicat_dataset: Assigning default accessGroups: %s", + json.dumps(config.dataset.default_access_groups), + ) dataset["accessGroups"] = config.dataset.default_access_groups dataset["size"] = len(datafilelist) dataset["numberOfFiles"] = sum([item["size"] for item in datafilelist]) dataset["isPublished"] = False - logger.info("_prepare_scicat_dataset: Scicat dataset: {}".format(json.dumps(dataset))) + logger.info("_prepare_scicat_dataset: Scicat dataset: %s", json.dumps(dataset)) return dataset -def _create_scicat_dataset( - dataset: dict, - config, - logger: logging.Logger -) -> dict: +def _create_scicat_dataset(dataset: dict, config, logger: logging.Logger) -> dict: """ Execute a POST request to scicat to create a dataset """ @@ -314,23 +328,25 @@ def _create_scicat_dataset( result = response.json() if not response.ok: err = result.get("error", {}) - logger.info(f"_create_scicat_dataset: Failed to create new dataset. Error {err}") + logger.error( + "_create_scicat_dataset: Failed to create new dataset. Error %s", err + ) raise Exception(f"Error creating new dataset: {err}") - logger.info("_create_scicat_dataset: Dataset created successfully. Dataset pid: %s", result['pid']) + logger.info( + "_create_scicat_dataset: Dataset created successfully. 
Dataset pid: %s", + result['pid'], + ) return result -def _prepare_scicat_origdatablock( - scicat_dataset, - datafilelist, - config, - logger -): +def _prepare_scicat_origdatablock(scicat_dataset, datafilelist, config, logger): """ Create local copy of the orig datablock to send to scicat """ - logger.info("_prepare_scicat_origdatablock: Preparing scicat origdatablock structure") + logger.info( + "_prepare_scicat_origdatablock: Preparing scicat origdatablock structure" + ) origdatablock = { "ownerGroup": scicat_dataset["ownerGroup"], "accessGroups": scicat_dataset["accessGroups"], @@ -340,19 +356,22 @@ def _prepare_scicat_origdatablock( "datasetId": scicat_dataset["pid"], } - logger.info("_prepare_scicat_origdatablock: Scicat origdatablock: {}".format(json.dumps(origdatablock))) + logger.info( + "_prepare_scicat_origdatablock: Scicat origdatablock: %s", + json.dumps(origdatablock), + ) return origdatablock def _create_scicat_origdatablock( - origdatablock: dict, - config, - logger: logging.Logger + origdatablock: dict, config, logger: logging.Logger ) -> dict: """ Execute a POST request to scicat to create a new origdatablock """ - logger.info("_create_scicat_origdatablock: Sending POST request to create new origdatablock") + logger.info( + "_create_scicat_origdatablock: Sending POST request to create new origdatablock" + ) response = requests.request( method="POST", url=urljoin(config.scicat.host, "origdatablocks"), @@ -366,43 +385,51 @@ def _create_scicat_origdatablock( result = response.json() if not response.ok: err = result.get("error", {}) - logger.info(f"_create_scicat_origdatablock: Failed to create new origdatablock. Error {err}") + logger.error( + "_create_scicat_origdatablock: Failed to create new origdatablock." + "Error %s", + err, + ) raise Exception(f"Error creating new origdatablock: {err}") - logger.info("_create_scicat_origdatablock: Origdatablock created successfully. Origdatablock pid: %s", result['_id']) + logger.info( + "_create_scicat_origdatablock: Origdatablock created successfully. 
" + "Origdatablock pid: %s", + result['_id'], + ) return result -def _define_dataset_source_folder( - datafilelist -) -> pathlib.Path: +def _define_dataset_source_folder(datafilelist) -> pathlib.Path: """ - Return the dataset source folder, which is the common path between all the data files associated with the dataset + Return the dataset source folder, which is the common path + between all the data files associated with the dataset """ - return pathlib.Path( os.path.commonpath( [item["path"] for item in datafilelist])) + return pathlib.Path(os.path.commonpath([item["path"] for item in datafilelist])) def _path_to_relative( - datafilelist_item: dict, - dataset_source_folder: pathlib.Path + datafilelist_item: dict, dataset_source_folder: pathlib.Path ) -> dict: """ - Copy the datafiles item and transform the path to the relative path to the dataset source folder + Copy the datafiles item and transform the path to the relative path + to the dataset source folder """ origdatablock_datafilelist_item = copy.deepcopy(datafilelist_item) - origdatablock_datafilelist_item["path"] = str(datafilelist_item["path"].to_relative(dataset_source_folder)) + origdatablock_datafilelist_item["path"] = str( + datafilelist_item["path"].to_relative(dataset_source_folder) + ) return origdatablock_datafilelist_item def _prepare_origdatablock_datafilelist( - datafiles_list: list, - dataset_source_folder: pathlib.Path + datafiles_list: list, dataset_source_folder: pathlib.Path ) -> list: """ Prepare the datafiles list for the origdatablock entry in scicat That means that the file paths needs to be relative to the dataset source folder """ - return [_path_to_relative(item,dataset_source_folder) for item in datafiles_list] + return [_path_to_relative(item, dataset_source_folder) for item in datafiles_list] def main() -> None: @@ -411,7 +438,6 @@ def main() -> None: arg_namespace = arg_parser.parse_args() config = build_scicat_offline_ingestor_config(arg_namespace) ingestion_options = config.ingestion - file_handling_options = ingestion_options.file_handling logger = build_logger(config) # Log the configuration as dictionary so that it is easier to read from the logs @@ -432,19 +458,20 @@ def main() -> None: done_writing_message_file_path = pathlib.Path() if config.ingestion.file_handling.message_to_file: done_writing_message_file_path = pathlib.Path( - config.offline_run.done_writing_message_file) + config.offline_run.done_writing_message_file + ) logger.info( "Done writing message file linked to nexus file : %s", - done_writing_message_file_path + done_writing_message_file_path, ) # log done writing message input file logger.info(json.load(done_writing_message_file_path.open())) - # define which is the directory where the ingestor should save the files it creates, if any is created + # define which is the directory where the ingestor should save + # the files it creates, if any is created ingestor_directory = compose_ingestor_directory( - config.ingestion.file_handling, - nexus_file_path + config.ingestion.file_handling, nexus_file_path ) # open nexus file with h5py @@ -485,48 +512,32 @@ def main() -> None: done_writing_message_file_path, ingestor_directory, config, - logger + logger, ) - dataset_source_folder = _define_dataset_source_folder( - datafilelist - ) + dataset_source_folder = _define_dataset_source_folder(datafilelist) origdatablock_datafiles_list = _prepare_origdatablock_datafilelist( - datafilelist, - dataset_source_folder + datafilelist, dataset_source_folder ) # create and populate scicat dataset 
entry local_dataset = _prepare_scicat_dataset( - metadata_schema, - variables_values, - datafilelist, - config, - logger + metadata_schema, variables_values, datafilelist, config, logger ) # create dataset in scicat - scicat_dataset = _create_scicat_dataset( - local_dataset, - config, - logger - ) + scicat_dataset = _create_scicat_dataset(local_dataset, config, logger) # create and populate scicat origdatablock entry # with files and hashes previously computed local_origdatablock = _prepare_scicat_origdatablock( - scicat_dataset, - origdatablock_datafiles_list, - config, - logger + scicat_dataset, origdatablock_datafiles_list, config, logger ) # create origdatablock in scicat scicat_origdatablock = _create_scicat_origdatablock( - local_origdatablock, - config, - logger + local_origdatablock, config, logger ) # check one more time if we successfully created the entries in scicat diff --git a/src/scicat_path_helpers.py b/src/scicat_path_helpers.py index 3635323..df52782 100644 --- a/src/scicat_path_helpers.py +++ b/src/scicat_path_helpers.py @@ -6,16 +6,19 @@ def compose_ingestor_directory( - fh_options: FileHandlingOptions, - nexus_file_path: str | pathlib.Path + fh_options: FileHandlingOptions, nexus_file_path: str | pathlib.Path ) -> pathlib.Path: """Select the ingestor directory based on the file path and the options.""" directory = pathlib.Path(fh_options.ingestor_files_directory) - nexus_file_path = pathlib.Path(nexus_file_path) if isinstance(nexus_file_path,str) else nexus_file_path + nexus_file_path = ( + pathlib.Path(nexus_file_path) + if isinstance(nexus_file_path, str) + else nexus_file_path + ) if directory.is_absolute(): return directory else: - directory = nexus_file_path.parents[0] / directory + directory = nexus_file_path.parents[0] / directory return directory.resolve() diff --git a/src/scicat_schemas/load_template.py b/src/scicat_schemas/load_template.py index abca2ae..e2fb3d5 100644 --- a/src/scicat_schemas/load_template.py +++ b/src/scicat_schemas/load_template.py @@ -5,7 +5,9 @@ from jinja2 import Template _CUR_DIR = pathlib.Path(__file__).parent -_DATAFILELIST_ITEM_SCHEMA_TEMPLATE_PATH = _CUR_DIR / pathlib.Path("datafilelist_item.schema.json.jinja") +_DATAFILELIST_ITEM_SCHEMA_TEMPLATE_PATH = _CUR_DIR / pathlib.Path( + "datafilelist_item.schema.json.jinja" +) _DATASET_SCHEMA_TEMPLATE_PATH = _CUR_DIR / pathlib.Path("dataset.schema.json.jinja") _ORIGDATABLOCK_SCHEMA_TEMPLATE_PATH = _CUR_DIR / pathlib.Path( "origdatablock.schema.json.jinja" diff --git a/src/system_helpers.py b/src/system_helpers.py index c9e1aa4..53ce9b0 100644 --- a/src/system_helpers.py +++ b/src/system_helpers.py @@ -32,9 +32,10 @@ def online_ingestor_exit_at_exceptions( logger.info("Finished successfully.") exit(logger, unexpected=False) + @contextmanager def offline_ingestor_exit_at_exceptions( - logger: logging.Logger + logger: logging.Logger, ) -> Generator[None, None, None]: """ manage exceptions specifically for offline ingestor diff --git a/tests/test_scicat_schema.py b/tests/test_scicat_schema.py index 33f23dd..bc98c84 100644 --- a/tests/test_scicat_schema.py +++ b/tests/test_scicat_schema.py @@ -83,7 +83,7 @@ def test_dataset_schema_rendering() -> None: from scicat_dataset import build_dataset_instance - dataset_schema = build_dataset_description( + dataset_schema = build_dataset_instance( dataset_pid_prefix="12.234.34567", nxs_dataset_pid="e3690b21-ee8c-40d6-9409-6b6fdca776d2", dataset_name="this is a dataset", @@ -130,7 +130,7 @@ def test_single_file_description_rendering() -> None: from 
scicat_dataset import build_single_datafile_instance - file_description = build_single_datafile_description( + file_description = build_single_datafile_instance( file_absolute_path="/ess/data/coda/2024/616254/0001.nxs", file_size=1231231, datetime_isoformat="2024-07-16T10:00:00.000Z", @@ -160,7 +160,7 @@ def test_single_file_description_rendering_no_checksum() -> None: from scicat_dataset import build_single_datafile_instance - file_description = build_single_datafile_description( + file_description = build_single_datafile_instance( file_absolute_path="/ess/data/coda/2024/616254/0002.nxs", file_size=1231231, datetime_isoformat="2024-07-16T10:00:00.000Z", @@ -209,9 +209,9 @@ def test_single_file_description_rendering_no_checksum() -> None: def test_orig_datablock_rendering() -> None: import json - from scicat_dataset import build_orig_datablock_instance + from scicat_dataset import build_origdatablock_instance - orig_datablock = build_orig_datablock_description( + orig_datablock = build_origdatablock_instance( dataset_pid_prefix="20.500.12269", nxs_dataset_pid="53fd2786-3729-11ef-83e5-fa163e9aae0a", dataset_size=446630741, From 14b66014bcd63381d2604587e9da28b80b405f44 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Mon, 5 Aug 2024 21:57:47 +0200 Subject: [PATCH 07/25] Update package import test and temporarily remove the option tests. --- tests/minimum_test.py | 3 +- tests/test_scicat_configuration.py | 113 ----------------------------- 2 files changed, 2 insertions(+), 114 deletions(-) delete mode 100644 tests/test_scicat_configuration.py diff --git a/tests/minimum_test.py b/tests/minimum_test.py index 9b65be5..0935eef 100644 --- a/tests/minimum_test.py +++ b/tests/minimum_test.py @@ -1,2 +1,3 @@ def test_package() -> None: - import scicat_ingestor # noqa: F401 + import scicat_offline_ingestor # noqa: F401 + import scicat_online_ingestor # noqa: F401 diff --git a/tests/test_scicat_configuration.py b/tests/test_scicat_configuration.py deleted file mode 100644 index 5bd8a08..0000000 --- a/tests/test_scicat_configuration.py +++ /dev/null @@ -1,113 +0,0 @@ -# SPDX-License-Identifier: BSD-3-Clause -# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject) -import argparse - -import pytest -from scicat_configuration import IngesterConfig - - -@pytest.fixture() -def main_arg_parser() -> argparse.ArgumentParser: - """Return the namespace of the main argument parser.""" - from scicat_configuration import build_main_arg_parser - - return build_main_arg_parser() - - -def test_scicat_arg_parser_configuration_matches( - main_arg_parser: argparse.ArgumentParser, -) -> None: - """Test if options in the configuration file matches the argument parser.""" - import json - import pathlib - - scicat_namespace = main_arg_parser.parse_args( - ['-c', 'resources/config.sample.json'] - ) - - # Check if the configuration file is the same - assert scicat_namespace.config_file == 'resources/config.sample.json' - config_path = pathlib.Path(scicat_namespace.config_file) - config_from_args: dict = vars(scicat_namespace) - - # Parse the configuration file - assert config_path.exists() - config_from_file: dict = json.loads(config_path.read_text()) - main_options: dict = config_from_file.get('options', {}) - - # Check if all keys matches - all_keys = set(config_from_args.keys()).union(main_options.keys()) - for key in all_keys: - assert key in config_from_args - assert key in main_options - - -def test_build_scicat_config_default(main_arg_parser: argparse.ArgumentParser) -> None: - """Test if the 
configuration can be built from default arguments.""" - from scicat_configuration import build_scicat_ingester_config - - scicat_namespace = main_arg_parser.parse_args() - scicat_config = build_scicat_ingester_config(scicat_namespace) - assert scicat_config.run_options.config_file == 'config.20240405.json' - - -@pytest.fixture() -def ingester_config(main_arg_parser: argparse.ArgumentParser) -> IngesterConfig: - from scicat_configuration import build_scicat_ingester_config - - scicat_namespace = main_arg_parser.parse_args( - ['-c', 'resources/config.sample.json', '--verbose'] - ) - return build_scicat_ingester_config(scicat_namespace) - - -def test_build_scicat_config(ingester_config: IngesterConfig) -> None: - """Test if the configuration can be built from arguments.""" - assert ingester_config.original_dict['options']['config_file'] == 'config.json' - assert ingester_config.run_options.config_file == 'resources/config.sample.json' - assert not ingester_config.original_dict['options']['verbose'] - assert ingester_config.run_options.verbose - - -def test_scicat_config_original_dict_read_only(ingester_config: IngesterConfig) -> None: - """Test if the original dictionary is read-only.""" - from types import MappingProxyType - - assert isinstance(ingester_config.original_dict, MappingProxyType) - for sub_option in ingester_config.original_dict.values(): - assert isinstance(sub_option, MappingProxyType) - - -def test_scicat_config_kafka_options(ingester_config: IngesterConfig) -> None: - """Test if the Kafka options are correctly read.""" - assert ingester_config.kafka_options.topics == ["KAFKA_TOPIC_1", "KAFKA_TOPIC_2"] - assert ingester_config.kafka_options.enable_auto_commit - - -def test_scicat_background_config_single_run_option() -> None: - """Test if the single run options are correctly read.""" - from scicat_configuration import ( - build_background_ingestor_arg_parser, - build_scicat_background_ingester_config, - ) - - arg_parser = build_background_ingestor_arg_parser() - scicat_namespace = arg_parser.parse_args( - [ - '-c', - 'resources/config.sample.json', - '--verbose', - '--nexus-file', - 'file.nxs', - '--done-writing-message-file', - 'file.json', - ] - ) - background_ingester_config = build_scicat_background_ingester_config( - scicat_namespace - ) - assert background_ingester_config.single_run_options.nexus_file == 'file.nxs' - assert ( - background_ingester_config.single_run_options.done_writing_message_file - == 'file.json' - ) From 8ca1c83e3429975b68cda10b764cba3779858afe Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Mon, 5 Aug 2024 21:59:00 +0200 Subject: [PATCH 08/25] Fix path to the main script. 
--- pyproject.toml | 4 ++-- tests/test_logging.py | 45 ------------------------------------------- 2 files changed, 2 insertions(+), 47 deletions(-) delete mode 100644 tests/test_logging.py diff --git a/pyproject.toml b/pyproject.toml index 4d69d47..75233ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,8 +46,8 @@ dynamic = ["version"] "Source" = "https://github.com/ScicatProject/scicat-filewriter-ingest" [project.scripts] -scicat_ingestor = "scicat_ingestor:main" -background_ingestor = "background_ingestor:main" +scicat_ingestor = "scicat_online_ingestor:main" +background_ingestor = "scicat_offline_ingestor:main" [project.entry-points."scicat_ingestor.metadata_extractor"] max = "numpy:max" diff --git a/tests/test_logging.py b/tests/test_logging.py deleted file mode 100644 index 8e30445..0000000 --- a/tests/test_logging.py +++ /dev/null @@ -1,45 +0,0 @@ -import pathlib - -import pytest -from scicat_configuration import ( - DatasetOptions, - FileHandlingOptions, - GraylogOptions, - IngesterConfig, - IngestionOptions, - RunOptions, - kafkaOptions, -) - - -@pytest.fixture() -def scicat_config(tmp_path: pathlib.Path) -> IngesterConfig: - return IngesterConfig( - original_dict={}, - run_options=RunOptions( - config_file='test', - verbose=True, - file_log=True, - file_log_base_name=(tmp_path / pathlib.Path('test')).as_posix(), - file_log_timestamp=True, - system_log=False, - system_log_facility=None, - log_message_prefix='test', - logging_level='DEBUG', - check_by_job_id=True, - pyscicat='test', - ), - kafka_options=kafkaOptions(), - graylog_options=GraylogOptions(), - ingestion_options=IngestionOptions( - file_handling_options=FileHandlingOptions(), - dataset_options=DatasetOptions(), - ), - ) - - -def test_scicat_logging_build_logger(scicat_config: IngesterConfig) -> None: - from scicat_logging import build_logger - - logger = build_logger(scicat_config) - assert len(logger.handlers) == 2 # FileHandler and StreamHandler From d339a45f595dc192ef24edecbd0c54438ff7934d Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Tue, 6 Aug 2024 08:10:34 +0200 Subject: [PATCH 09/25] Use datetime instead pytz --- src/scicat_offline_ingestor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index ccf0d81..d52147b 100644 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -13,7 +13,6 @@ from urllib.parse import urljoin import h5py -import pytz import requests from scicat_configuration import ( OfflineIngestorConfig, @@ -133,7 +132,7 @@ def _create_datafilelist_item(file_full_path: pathlib.Path, config, logger): **{ "size": stats.st_size, "time": datetime.datetime.fromtimestamp( - stats.st_ctime, tz=pytz.utc + stats.st_ctime, tz=datetime.UTC ).strftime("%Y-%m-%dT%H:%M:%S.000Z"), "uid": stats.st_uid, "gid": stats.st_gid, From 0617eb1eb3a43964262d88af9a4b6a97b4702871 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Tue, 6 Aug 2024 08:20:32 +0200 Subject: [PATCH 10/25] Remove schema related tests before updating. 
--- tests/test_scicat_schema.py | 226 ------------------------------------ 1 file changed, 226 deletions(-) delete mode 100644 tests/test_scicat_schema.py diff --git a/tests/test_scicat_schema.py b/tests/test_scicat_schema.py deleted file mode 100644 index bc98c84..0000000 --- a/tests/test_scicat_schema.py +++ /dev/null @@ -1,226 +0,0 @@ -# SPDX-License-Identifier: BSD-3-Clause -# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject) - - -def test_single_datafile_template_loading() -> None: - from scicat_schemas.load_template import load_single_datafile_template - - assert load_single_datafile_template() is not None - - -def test_dataset_schema_template_loading() -> None: - from scicat_schemas.load_template import load_dataset_schema_template - - assert load_dataset_schema_template() is not None - - -def test_origdatablock_schema_template_loading() -> None: - from scicat_schemas.load_template import load_origdatablock_schema_template - - assert load_origdatablock_schema_template() is not None - - -_example_scientific_metadata = """"run_number": { - "value": 18856, - "unit": "", - "human_name": "Run Number", - "type": "integer" - }, - "sample_temperature": { - "value": 20.4, - "unit": "C", - "human_name": "Sample Temperature", - "type": "quantity" - }, - "start_time" : { - "value" : "2024-07-16T09:30:12.987Z", - "unit" : "", - "human_name" : "Start Time", - "type" : "date" - }""" - -_example_dataset_schema = ( - """ -{ - "pid": "12.234.34567/e3690b21-ee8c-40d6-9409-6b6fdca776d2", - "datasetName": "this is a dataset", - "description": "this is the description of the dataset", - "principalInvestigator": "Somebodys Name", - "creationLocation": "ESS:CODA", - "scientificMetadata": { - """ - + _example_scientific_metadata - + """ - }, - "owner": "Somebodys Name", - "ownerEmail": "someones_@_email", - "sourceFolder": "/ess/data/coda/2024/616254", - "contactEmail": "someones_@_email", - "creationTime": "2024-07-16T10:00:00.000Z", - "type": "raw", - "techniques": [ - { - "pid": "someprotocol://someones/url/and/id", - "names": "absorption and phase contrast nanotomography" - } - ], - "instrumentId": "12.234.34567/765b3dc3-f658-410e-b371-04dd1adcd520", - "sampleId": "bd31725a-dbfd-4c32-87db-1c1ebe61e5ca", - "proposalId": "616254", - "ownerGroup": "ess_proposal_616254", - "accessGroups": [ - "scientific information management systems group", - "scicat group" - ] -} - -""" -) - - -def test_dataset_schema_rendering() -> None: - import json - - from scicat_dataset import build_dataset_instance - - dataset_schema = build_dataset_instance( - dataset_pid_prefix="12.234.34567", - nxs_dataset_pid="e3690b21-ee8c-40d6-9409-6b6fdca776d2", - dataset_name="this is a dataset", - dataset_description="this is the description of the dataset", - principal_investigator="Somebodys Name", - facility="ESS", - environment="CODA", - scientific_metadata=_example_scientific_metadata, - owner="Somebodys Name", - owner_email="someones_@_email", - source_folder="/ess/data/coda/2024/616254", - contact_email="someones_@_email", - iso_creation_time="2024-07-16T10:00:00.000Z", - technique_pid="someprotocol://someones/url/and/id", - technique_name="absorption and phase contrast nanotomography", - instrument_id="12.234.34567/765b3dc3-f658-410e-b371-04dd1adcd520", - sample_id="bd31725a-dbfd-4c32-87db-1c1ebe61e5ca", - proposal_id="616254", - owner_group="ess_proposal_616254", - access_groups=[ - "scientific information management systems group", - "scicat group", - ], - ) - - assert json.loads(dataset_schema) == 
json.loads(_example_dataset_schema) - - -_example_file_description_1 = """ -{ - "path": "/ess/data/coda/2024/616254/0001.nxs", - "size": 1231231, - "time": "2024-07-16T10:00:00.000Z", - "chk": "1234567890abcdef", - "uid": "1004", - "gid": "1005", - "perm": "33188" -} -""" - - -def test_single_file_description_rendering() -> None: - import json - - from scicat_dataset import build_single_datafile_instance - - file_description = build_single_datafile_instance( - file_absolute_path="/ess/data/coda/2024/616254/0001.nxs", - file_size=1231231, - datetime_isoformat="2024-07-16T10:00:00.000Z", - checksum="1234567890abcdef", - uid="1004", - gid="1005", - perm="33188", - ) - - assert json.loads(file_description) == json.loads(_example_file_description_1) - - -_example_file_description_2 = """ -{ - "path": "/ess/data/coda/2024/616254/0002.nxs", - "size": 1231231, - "time": "2024-07-16T10:00:00.000Z", - "uid": "1004", - "gid": "1005", - "perm": "33188" -} -""" - - -def test_single_file_description_rendering_no_checksum() -> None: - import json - - from scicat_dataset import build_single_datafile_instance - - file_description = build_single_datafile_instance( - file_absolute_path="/ess/data/coda/2024/616254/0002.nxs", - file_size=1231231, - datetime_isoformat="2024-07-16T10:00:00.000Z", - uid="1004", - gid="1005", - perm="33188", - ) - - assert json.loads(file_description) == json.loads(_example_file_description_2) - - -_example_file_description_3 = """ -{ - "path": "/ess/data/coda/2024/616254/0003.nxs", - "size": 1231231, - "time": "2024-07-16T10:00:00.000Z", - "chk": "1234567890abcdef", - "uid": "1004", - "gid": "1005", - "perm": "33188" -} -""" - -_example_orig_datablock = ( - """ -{ - "datasetId": "20.500.12269/53fd2786-3729-11ef-83e5-fa163e9aae0a", - "size": 446630741, - "chkAlg": "blake2b", - "dataFileList": [ - """ - + _example_file_description_1 - + """, - """ - + _example_file_description_2 - + """, - """ - + _example_file_description_3 - + """ - ] -} -""" -) - - -def test_orig_datablock_rendering() -> None: - import json - - from scicat_dataset import build_origdatablock_instance - - orig_datablock = build_origdatablock_instance( - dataset_pid_prefix="20.500.12269", - nxs_dataset_pid="53fd2786-3729-11ef-83e5-fa163e9aae0a", - dataset_size=446630741, - check_algorithm="blake2b", - data_file_desc_list=[ - _example_file_description_1, - _example_file_description_2, - _example_file_description_3, - ], - ) - - assert json.loads(orig_datablock) == json.loads(_example_orig_datablock) From caf91ef62fee95e3e4a3a6a1a46e37052798bd60 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Tue, 6 Aug 2024 08:20:53 +0200 Subject: [PATCH 11/25] Update import path. 
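Only import paths change here: the template loader is imported under its new ``load_datafilelist_item_schema_template`` name, and the ``src.`` prefix is dropped so the module resolves the same way for the installed package and for the console scripts. A minimal sketch of the import style assumed from now on (assuming ``src/`` is the package root on ``sys.path``):

```python
# Sketch of the assumed import style: modules under src/ import each other
# directly, without the "src." prefix, so the same statement works whether
# the package is installed or run via the console scripts.
from scicat_path_helpers import (
    compose_ingestor_directory,
    compose_ingestor_output_file_path,
)
```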
---
 src/scicat_dataset.py          | 4 ++--
 src/scicat_offline_ingestor.py | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py
index 8c67d14..ed22072 100644
--- a/src/scicat_dataset.py
+++ b/src/scicat_dataset.py
@@ -205,9 +205,9 @@ def save_and_build_single_hash_file_desc(
     import datetime
     import json
 
-    from scicat_schemas import load_single_datafile_template
+    from scicat_schemas import load_datafilelist_item_schema_template
 
-    single_file_template = load_single_datafile_template()
+    single_file_template = load_datafilelist_item_schema_template()
     file_hash: str = original_file_desciption["chk"]
     hash_path = _build_hash_file_path(
         original_file_path=original_file_desciption["path"],
diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py
index d52147b..e750aba 100644
--- a/src/scicat_offline_ingestor.py
+++ b/src/scicat_offline_ingestor.py
@@ -24,12 +24,11 @@
 )
 from scicat_logging import build_logger
 from scicat_metadata import collect_schemas, select_applicable_schema
-from system_helpers import exit, offline_ingestor_exit_at_exceptions
-
-from src.scicat_path_helpers import (
+from scicat_path_helpers import (
     compose_ingestor_directory,
     compose_ingestor_output_file_path,
 )
+from system_helpers import exit, offline_ingestor_exit_at_exceptions
 
 
 def replace_variables_values(url: str, values: dict) -> str:

From e44714d7e872760b676803dc61bd9132f89d1f97 Mon Sep 17 00:00:00 2001
From: YooSunyoung
Date: Tue, 6 Aug 2024 09:10:49 +0200
Subject: [PATCH 12/25] Log why we use dataclass instead of jinja or dict.

---
 README.md                                    | 45 +++++++++++++++++++
 .../datafilelist_item.schema.json.jinja      |  9 ----
 src/scicat_schemas/dataset.schema.json.jinja | 30 -------------
 .../origdatablock.schema.json.jinja          |  9 ----
 4 files changed, 45 insertions(+), 48 deletions(-)
 delete mode 100644 src/scicat_schemas/datafilelist_item.schema.json.jinja
 delete mode 100644 src/scicat_schemas/dataset.schema.json.jinja
 delete mode 100644 src/scicat_schemas/origdatablock.schema.json.jinja

diff --git a/README.md b/README.md
index 3b44d0c..a6f6696 100644
--- a/README.md
+++ b/README.md
@@ -87,3 +87,48 @@ copier update
 `tox` controls virtual environment and commands for various purposes.
 Developers and CI actions can use the command.
 For example, `tox -e docs` builds documentation under `./html` directory and `tox -e py310` will run unit tests with python version `3.10`.
+
+## ADR
+(Architecture Decision Records)
+
+### ADR-001: Use ``dataclass`` instead of ``jinja`` or ``dict`` to create dataset/data-block instances.
+We need a dict-like template to create dataset/data-block instances via the scicat APIs.
+#### Reason for not using ``dict``
+It used to be implemented with ``dict``, but a plain ``dict`` has no validation layer, so the instances could be broken silently and the errors only surfaced in upstream layers.
+#### Reason for not using ``jinja``
+
+A ``jinja`` template can handle more complicated logic within the template, e.g. ``for`` loops or ``if`` statements applied to the variables.
+However, the dataset/data-block instances are not complicated enough to need these features of ``jinja``.
+#### Reason for using ``dataclasses.dataclass``
+We first tried ``jinja``, but the dataset/data-block instances turned out to be simple enough that we replaced the ``jinja`` templates with ``dataclass``es.
+A ``dataclass`` can verify the name and type of each field (via static checks).
+It can be easily turned into a nested dictionary using ``dataclasses.asdict`` function. + +#### Downside of using ``dataclass`` instead of ``jinja`` +With ``jinja`` template, certain fields could be skipped based on a variable. +However, it is not possible in the dataclass so it will need extra handling after turning it to a dictionary. +For example, each datafile item can have ``chk`` field, but this field shouldn't exist if checksum was not derived. +With jinja template we could handle this like below +```jinja +{ + "path": "{{ path }}", + "size": {{ size }}, + "time": "{{ time }}", + {% if chk %}"chk": "{{ chk }}"{% endif %} +} +``` +However, with dataclass this should be handled like below. +```python +from dataclasses import dataclass, asdict +@dataclass +class DataFileItem: + path: str + size: int + time: str + chk: None | str = None + +data_file_item = { + k: v if (k!='chk' or v is not None) + for k, v in asdict(DataFileItem('./', 1, '00:00')).items() +} +``` diff --git a/src/scicat_schemas/datafilelist_item.schema.json.jinja b/src/scicat_schemas/datafilelist_item.schema.json.jinja deleted file mode 100644 index 5c0cef6..0000000 --- a/src/scicat_schemas/datafilelist_item.schema.json.jinja +++ /dev/null @@ -1,9 +0,0 @@ -{ - "path": "{{ path }}", - "size": {{ size }}, - "time": "{{ time }}", - {% if chk %}"chk": "{{ chk }}",{% endif %} - {% if uid %}"uid": "{{ uid }}",{% endif %} - {% if gid %}"gid": "{{ gid }}",{% endif %} - {% if perm %}"perm": "{{ perm }}"{% endif %} -} diff --git a/src/scicat_schemas/dataset.schema.json.jinja b/src/scicat_schemas/dataset.schema.json.jinja deleted file mode 100644 index ee8da7c..0000000 --- a/src/scicat_schemas/dataset.schema.json.jinja +++ /dev/null @@ -1,30 +0,0 @@ -{ - "pid": "{{ pid }}", - "datasetName": "{{ datasetName }}", - "description": "{{ datasetDescription }}", - "principalInvestigator": "{{ principalInvestigator }}", - "creationLocation": "{{ creationLocation }}", - "scientificMetadata": { - {{ scientificMetadata }} - }, - "owner": "{{ owner }}", - "ownerEmail": "{{ ownerEmail }}", - "sourceFolder": "{{ sourceFolder }}", - "contactEmail": "{{ contactEmail }}", - "creationTime": "{{ creationTime }}", - "type": "raw", - {% if techniques }{% for technique in techniques } - "techniques": [ - { - "pid": "{{ technique.pid }}", - "names": "{{ technique.name }}" - } - ]{% endif %}{% endfor %}, - "instrumentId": "{{ instrumentId }}", - "sampleId": "{{ sampleId }}", - "proposalId": "{{ proposalId }}", - "ownerGroup": "{{ ownerGroup }}", - "accessGroups": [ - {% for accessGroup in accessGroups %}"{{ accessGroup }}"{% if not loop.last %},{% endif %}{% endfor %} - ] -} diff --git a/src/scicat_schemas/origdatablock.schema.json.jinja b/src/scicat_schemas/origdatablock.schema.json.jinja deleted file mode 100644 index 1038506..0000000 --- a/src/scicat_schemas/origdatablock.schema.json.jinja +++ /dev/null @@ -1,9 +0,0 @@ -{ - "datasetId": "{{ datasetId }}", - "size": {{ size }}, - "chkAlg": "{{ chkAlg }}", - "dataFileList": [ - {% for dataFileList_item in dataFileList %}{{ dataFileList_item }}{% if not loop.last %}, - {% endif %}{% endfor %} - ] -} From 7bbf89cb43a449565e834c158bc0a1c9f8a58b88 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Tue, 6 Aug 2024 09:11:56 +0200 Subject: [PATCH 13/25] Remove dependencies related to jinja --- MANIFEST.in | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index f131701..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,2 +0,0 @@ 
-include src/scicat_schemas/dataset.schema.json.jinja -include src/scicat_schemas/origdatablock.schema.json.jinja From 559137d6d582cdc18a162bc8b9db4d3fe179b4d5 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Tue, 6 Aug 2024 09:12:46 +0200 Subject: [PATCH 14/25] Remove jinja template related functions. --- src/scicat_schemas/__init__.py | 9 --------- src/scicat_schemas/load_template.py | 29 ----------------------------- 2 files changed, 38 deletions(-) delete mode 100644 src/scicat_schemas/__init__.py delete mode 100644 src/scicat_schemas/load_template.py diff --git a/src/scicat_schemas/__init__.py b/src/scicat_schemas/__init__.py deleted file mode 100644 index c9c5e87..0000000 --- a/src/scicat_schemas/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: BSD-3-Clause -# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject) -# ruff: noqa: F401 - -from .load_template import ( - load_datafilelist_item_schema_template, - load_dataset_schema_template, - load_origdatablock_schema_template, -) diff --git a/src/scicat_schemas/load_template.py b/src/scicat_schemas/load_template.py deleted file mode 100644 index e2fb3d5..0000000 --- a/src/scicat_schemas/load_template.py +++ /dev/null @@ -1,29 +0,0 @@ -# SPDX-License-Identifier: BSD-3-Clause -# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject) -import pathlib - -from jinja2 import Template - -_CUR_DIR = pathlib.Path(__file__).parent -_DATAFILELIST_ITEM_SCHEMA_TEMPLATE_PATH = _CUR_DIR / pathlib.Path( - "datafilelist_item.schema.json.jinja" -) -_DATASET_SCHEMA_TEMPLATE_PATH = _CUR_DIR / pathlib.Path("dataset.schema.json.jinja") -_ORIGDATABLOCK_SCHEMA_TEMPLATE_PATH = _CUR_DIR / pathlib.Path( - "origdatablock.schema.json.jinja" -) - - -def load_datafilelist_item_schema_template() -> Template: - """Load the template for the single datafile schema.""" - return Template((_DATAFILELIST_ITEM_SCHEMA_TEMPLATE_PATH).read_text()) - - -def load_dataset_schema_template() -> Template: - """Load the template for the dataset schema.""" - return Template((_DATASET_SCHEMA_TEMPLATE_PATH).read_text()) - - -def load_origdatablock_schema_template() -> Template: - """Load the template for the original data block schema.""" - return Template((_ORIGDATABLOCK_SCHEMA_TEMPLATE_PATH).read_text()) From d15fbe635bf77a8527f2d79220c9f3ba805047a8 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Tue, 6 Aug 2024 09:13:08 +0200 Subject: [PATCH 15/25] Remove dependencies related to jinja --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 75233ab..8780d37 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,6 @@ dependencies = [ "ess-streaming-data-types", "graypy", "h5py", - "jinja2", "kafka-python", "requests", "rich" From 21c3e0aabbff05d6e6895a2e8814313c37cb7b3e Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Tue, 6 Aug 2024 09:23:20 +0200 Subject: [PATCH 16/25] Update example in readme. 
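The example added with the ADR is not valid Python: a dict comprehension cannot attach a bare ``k: v if condition`` (there is no ``else``), so it raises a ``SyntaxError``; filtering belongs in an ``if`` clause after the ``for``. A corrected, self-contained version of the snippet, matching the change below:

```python
from dataclasses import asdict, dataclass


@dataclass
class DataFileItem:
    path: str
    size: int
    time: str
    chk: None | str = None


# Drop "chk" only when no checksum was computed; the filtering condition
# goes after the "for" clause of the comprehension.
data_file_item = {
    k: v
    for k, v in asdict(DataFileItem('./', 1, '00:00')).items()
    if (k != 'chk' or v is not None)
}
```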
--- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a6f6696..4d158ee 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,8 @@ class DataFileItem: chk: None | str = None data_file_item = { - k: v if (k!='chk' or v is not None) + k: v for k, v in asdict(DataFileItem('./', 1, '00:00')).items() + if (k!='chk' or v is not None) } ``` From bb278d8571f2e84aa9104d3034b25dec67424a8b Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Tue, 6 Aug 2024 15:03:11 +0200 Subject: [PATCH 17/25] Update data file item creating logic. --- src/scicat_dataset.py | 307 +++++++++++++++++---------------- src/scicat_offline_ingestor.py | 134 ++------------ 2 files changed, 175 insertions(+), 266 deletions(-) diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py index ed22072..f2790b6 100644 --- a/src/scicat_dataset.py +++ b/src/scicat_dataset.py @@ -1,17 +1,13 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject) import datetime -import json +import logging import pathlib +from dataclasses import dataclass from types import MappingProxyType from typing import Any from scicat_configuration import FileHandlingOptions -from scicat_schemas import ( - load_datafilelist_item_schema_template, - load_dataset_schema_template, - load_origdatablock_schema_template, -) def to_string(value: Any) -> str: @@ -58,96 +54,65 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any: return converter(input_value) -def build_dataset_instance( - *, - dataset_pid_prefix: str, - nxs_dataset_pid: str, - dataset_name: str, - dataset_description: str, - principal_investigator: str, - facility: str, - environment: str, - scientific_metadata: str, - owner: str, - owner_email: str, - source_folder: str, - contact_email: str, - iso_creation_time: str, - technique_pid: str, - technique_name: str, - instrument_id: str, - sample_id: str, - proposal_id: str, - owner_group: str, - access_groups: list[str], -) -> str: - return load_dataset_schema_template().render( - dataset_pid_prefix=dataset_pid_prefix, - nxs_dataset_pid=nxs_dataset_pid, - dataset_name=dataset_name, - dataset_description=dataset_description, - principal_investigator=principal_investigator, - facility=facility, - environment=environment, - scientific_metadata=scientific_metadata, - owner=owner, - owner_email=owner_email, - source_folder=source_folder, - contact_email=contact_email, - iso_creation_time=iso_creation_time, - technique_pid=technique_pid, - technique_name=technique_name, - instrument_id=instrument_id, - sample_id=sample_id, - proposal_id=proposal_id, - owner_group=owner_group, - access_groups=access_groups, - ) - - -def build_single_datafile_instance( - *, - file_absolute_path: str, - file_size: int, - datetime_isoformat: str, - uid: str, - gid: str, - perm: str, - checksum: str = "", -) -> str: - return load_datafilelist_item_schema_template().render( - file_absolute_path=file_absolute_path, - file_size=file_size, - datetime_isoformat=datetime_isoformat, - checksum=checksum, - uid=uid, - gid=gid, - perm=perm, - ) - - -def build_origdatablock_instance( - *, - dataset_pid_prefix: str, - nxs_dataset_pid: str, - dataset_size: int, - check_algorithm: str, - data_file_desc_list: list[str], -) -> str: - return load_origdatablock_schema_template().render( - dataset_pid_prefix=dataset_pid_prefix, - nxs_dataset_pid=nxs_dataset_pid, - dataset_size=dataset_size, - check_algorithm=check_algorithm, - data_file_desc_list=data_file_desc_list, - ) 
- - -def _calculate_checksum(file_path: pathlib.Path, algorithm_name: str) -> str: +@dataclass(kw_only=True) +class TechniqueDesc: + pid: str + "Technique PID" + names: str + "Technique Name" + + +@dataclass(kw_only=True) +class ScicatDataset: + pid: str + datasetName: str + description: str + principalInvestigator: str + creationLocation: str + scientificMetadata: dict + owner: str + ownerEmail: str + sourceFolder: str + contactEmail: str + creationTime: str + type: str = "raw" + techniques: list[TechniqueDesc] | None = None + instrumentId: str + sampleId: str + proposalId: str + ownerGroup: str + accessGroup: list[str] + + +@dataclass(kw_only=True) +class DataFileListItem: + path: str + "Absolute path to the file." + size: int | None = None + "Size of the single file in bytes." + time: str + chk: str | None = None + uid: str | None = None + gid: str | None = None + perm: str | None = None + + +@dataclass(kw_only=True) +class OrigDataBlockInstance: + datasetId: str + size: int + chkAlg: str + dataFileList: list[DataFileListItem] + + +def _calculate_checksum(file_path: pathlib.Path, algorithm_name: str) -> str | None: """Calculate the checksum of a file.""" import hashlib - if not algorithm_name == "b2blake": + if not file_path.exists(): + return None + + if algorithm_name != "b2blake": raise ValueError( "Only b2blake hash algorithm is supported for now. Got: ", f"{algorithm_name}", @@ -162,69 +127,121 @@ def _calculate_checksum(file_path: pathlib.Path, algorithm_name: str) -> str: return chk.hexdigest() -def build_single_data_file_desc( - file_path: pathlib.Path, config: FileHandlingOptions -) -> dict[str, Any]: - """ - Build the description of a single data file. - """ - single_file_template = load_datafilelist_item_schema_template() - - return json.loads( - single_file_template.render( - file_absolute_path=file_path.absolute(), - file_size=(file_stats := file_path.stat()).st_size, - datetime_isoformat=datetime.datetime.fromtimestamp( +def _create_single_data_file_list_item( + *, + file_path: pathlib.Path, + calculate_checksum: bool, + compute_file_stats: bool, + file_hash_algorithm: str = "", +) -> DataFileListItem: + """``DataFileListItem`` constructing helper.""" + + if file_path.exists() and compute_file_stats: + return DataFileListItem( + path=file_path.absolute().as_posix(), + size=(file_stats := file_path.stat()).st_size, + time=datetime.datetime.fromtimestamp( file_stats.st_ctime, tz=datetime.UTC ).strftime("%Y-%m-%dT%H:%M:%S.000Z"), - chk=_calculate_checksum(file_path, config.file_hash_algorithm), + chk=_calculate_checksum(file_path, file_hash_algorithm) + if calculate_checksum + else None, uid=str(file_stats.st_uid), gid=str(file_stats.st_gid), perm=oct(file_stats.st_mode), ) - ) + else: + return DataFileListItem( + path=file_path.absolute().as_posix(), + time=datetime.datetime.now(tz=datetime.UTC).strftime( + "%Y-%m-%dT%H:%M:%S.000Z" + ), + ) -def _build_hash_file_path( +def _build_hash_path( *, - original_file_path: str, - ingestor_files_directory: str, + original_file_instance: DataFileListItem, + dir_path: pathlib.Path, hash_file_extension: str, ) -> pathlib.Path: - """Build the path for the hash file.""" - original_path = pathlib.Path(original_file_path) - dir_path = pathlib.Path(ingestor_files_directory) - file_name = ".".join([original_path.name, hash_file_extension]) - return dir_path / pathlib.Path(file_name) - - -def save_and_build_single_hash_file_desc( - original_file_desciption: dict, config: FileHandlingOptions -) -> dict: - """Save the hash of the file and build 
the description.""" - import datetime - import json - - from scicat_schemas import load_datafilelist_item_schema_template - - single_file_template = load_datafilelist_item_schema_template() - file_hash: str = original_file_desciption["chk"] - hash_path = _build_hash_file_path( - original_file_path=original_file_desciption["path"], - ingestor_files_directory=config.ingestor_files_directory, - hash_file_extension=config.hash_file_extension, + "Compose path to the hash file." + file_stem = pathlib.Path(original_file_instance.path).stem + return dir_path / pathlib.Path(".".join([file_stem, hash_file_extension])) + + +def _save_hash_file( + *, + original_file_instance: DataFileListItem, + hash_path: pathlib.Path, +) -> None: + """Save the hash of the ``original_file_instance``.""" + if original_file_instance.chk is None: + raise ValueError("Checksum is not provided.") + + hash_path.write_text(original_file_instance.chk) + + +def create_data_file_list( + file_list: list[pathlib.Path], + ingestor_directory: pathlib.Path, + config: FileHandlingOptions, + logger: logging.Logger, +) -> list[DataFileListItem]: + """ + Create a list of ``DataFileListItem`` instances for the files provided. + + Params + ------ + file_list: + Paths to the files that will be ingested + - nexus_file(mandatory) + - done_writing_message_file(optional) + - nexus_structure_file(optional) + + """ + from functools import partial + + single_file_constructor = partial( + _create_single_data_file_list_item, + file_hash_algorithm=config.file_hash_algorithm, + compute_file_stats=config.compute_file_stats, ) - hash_path.write_text(file_hash) - return json.loads( - single_file_template.render( - file_absolute_path=hash_path.absolute(), - file_size=(file_stats := hash_path.stat()).st_size, - datetime_isoformat=datetime.datetime.fromtimestamp( - file_stats.st_ctime, tz=datetime.UTC - ).strftime("%Y-%m-%dT%H:%M:%S.000Z"), - uid=str(file_stats.st_uid), - gid=str(file_stats.st_gid), - perm=oct(file_stats.st_mode), + # Collect default data-file items + data_file_list = [] + for minimum_file_path in file_list: + logger.info("Adding file %s to the datafiles list", minimum_file_path) + new_file_item = single_file_constructor( + file_path=minimum_file_path, + calculate_checksum=config.compute_file_hash, ) - ) + data_file_list.append(new_file_item) + if config.save_file_hash: + logger.info( + "Computing hash of the file(%s) from disk...", minimum_file_path + ) + hash_file_path = _build_hash_path( + original_file_instance=new_file_item, + dir_path=ingestor_directory, + hash_file_extension=config.hash_file_extension, + ) + logger.info("Saving hash into a file ... %s", hash_file_path) + if new_file_item.chk is not None: + _save_hash_file( + original_file_instance=new_file_item, hash_path=hash_file_path + ) + data_file_list.append( + single_file_constructor( + file_path=hash_file_path, calculate_checksum=False + ) + ) + else: + logger.warning( + "File(%s) instance does not have checksum. " + "Probably the file does not exist. 
" + "Skip saving...", + minimum_file_path, + ) + + return data_file_list diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index e750aba..0bed98f 100644 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -21,13 +21,11 @@ ) from scicat_dataset import ( convert_to_type, + create_data_file_list, ) from scicat_logging import build_logger from scicat_metadata import collect_schemas, select_applicable_schema -from scicat_path_helpers import ( - compose_ingestor_directory, - compose_ingestor_output_file_path, -) +from scicat_path_helpers import compose_ingestor_directory from system_helpers import exit, offline_ingestor_exit_at_exceptions @@ -113,8 +111,6 @@ def _create_datafilelist_item(file_full_path: pathlib.Path, config, logger): :param logger: :return: """ - logger.info("create_datafilelist_item: adding file %s", file_full_path.absolute()) - datafilelist_item = { "path": file_full_path, "size": 0, @@ -142,87 +138,6 @@ def _create_datafilelist_item(file_full_path: pathlib.Path, config, logger): return datafilelist_item -def _compute_file_checksum_if_needed( - file_full_path: pathlib.Path, ingestor_directory: pathlib.Path, config, logger -): - checksum = "" - datafiles_item = {} - - if config.ingestion.compute_files_hash and os.path.exists(file_full_path): - logger.info("create_datafiles_entry: computing hash of the file from disk") - checksum = _compute_file_checksum( - file_full_path, config.ingestion.file_hash_algorithm - ) - - if config.ingstion.save_hash_in_file: - # file path for hash file - hash_file_full_path = compose_ingestor_output_file_path( - ingestor_directory, - file_full_path.stem, - config.ingestion.hash_file_extension, - ) - logger.info( - "create_datafiles_entry: saving hash in file %s", hash_file_full_path - ) - - # save hash in file - with hash_file_full_path.open('w') as fh: - fh.write(datafiles_item['chk']) - - datafiles_item = _create_datafilelist_item( - hash_file_full_path, config, logger - ) - - return checksum, datafiles_item - - -def _create_datafiles_list( - nexus_file_path: pathlib.Path, - done_writing_message_file_path: pathlib.Path, - ingestor_directory: pathlib.Path, - config, - logger, -) -> list: - """ - Update the file size and creation time according to the configuration - :param nexus_file_path: - :param done_writing_message_file_path, - :param config, - :param logger - :return: - """ - - logger.info( - "create_datafiles_list: adding nexus file %s", nexus_file_path.absolute() - ) - datafiles_list = [_create_datafilelist_item(nexus_file_path, config, logger)] - checksum, datafiles_hash_item = _compute_file_checksum_if_needed( - nexus_file_path, ingestor_directory, config, logger - ) - if checksum: - datafiles_list[0]['chk'] = checksum - if datafiles_hash_item: - datafiles_list.append(datafiles_hash_item) - - if config.ingestion.file_handling.message_to_file: - logger.info( - "create_datafiles_list: adding done writing message file %s", - done_writing_message_file_path.absolute(), - ) - datafiles_list.append( - _create_datafilelist_item(done_writing_message_file_path, config, logger) - ) - checksum, datafiles_hash_item = _compute_file_checksum_if_needed( - nexus_file_path, ingestor_directory, config, logger - ) - if checksum: - datafiles_list[-1]['chk'] = checksum - if datafiles_hash_item: - datafiles_list.append(datafiles_hash_item) - - return datafiles_list - - def _prepare_scicat_dataset( metadata_schema: dict, values: dict, datafilelist: list[dict], config, logger ): @@ -436,13 +351,14 @@ def main() 
-> None: arg_namespace = arg_parser.parse_args() config = build_scicat_offline_ingestor_config(arg_namespace) ingestion_options = config.ingestion + file_handling_options = ingestion_options.file_handling logger = build_logger(config) # Log the configuration as dictionary so that it is easier to read from the logs logger.info( - 'Starting the Scicat background Ingestor with the following configuration:' + 'Starting the Scicat background Ingestor with the following configuration: %s', + config.to_dict(), ) - logger.info(config.to_dict()) # Collect all metadata schema configurations schemas = collect_schemas(ingestion_options.schemas_directory) @@ -453,6 +369,7 @@ def main() -> None: "Nexus file to be ingested : %s", nexus_file_path, ) + data_file_paths = [nexus_file_path] done_writing_message_file_path = pathlib.Path() if config.ingestion.file_handling.message_to_file: done_writing_message_file_path = pathlib.Path( @@ -465,6 +382,7 @@ def main() -> None: # log done writing message input file logger.info(json.load(done_writing_message_file_path.open())) + data_file_paths.append(done_writing_message_file_path) # define which is the directory where the ingestor should save # the files it creates, if any is created @@ -482,46 +400,20 @@ def main() -> None: metadata_schema['variables'], h5file, config ) - # ============================================= - # I'm not sure that using jinja templates is the right thing to do - # ============================================= - # # Collect data-file descriptions - # data_file_list = [ - # build_single_data_file_desc(nexus_file_path, file_handling_options), - # build_single_data_file_desc( - # done_writing_message_file, file_handling_options - # ), - # # TODO: Add nexus structure file - # ] - # # Create hash of all the files if needed - # if file_handling_options.save_file_hash: - # data_file_list += [ - # save_and_build_single_hash_file_desc( - # data_file_dict, file_handling_options - # ) - # for data_file_dict in data_file_list - # ] - # # Collect all data-files and hash-files descriptions - # _ = [json.dumps(file_dict, indent=2) for file_dict in data_file_list] - - # create datafilelist - datafilelist = _create_datafiles_list( - nexus_file_path, - done_writing_message_file_path, - ingestor_directory, - config, - logger, + # Collect data-file descriptions + data_file_list = create_data_file_list( + data_file_paths, ingestor_directory, file_handling_options, logger ) - dataset_source_folder = _define_dataset_source_folder(datafilelist) + dataset_source_folder = _define_dataset_source_folder(data_file_list) origdatablock_datafiles_list = _prepare_origdatablock_datafilelist( - datafilelist, dataset_source_folder + data_file_list, dataset_source_folder ) # create and populate scicat dataset entry local_dataset = _prepare_scicat_dataset( - metadata_schema, variables_values, datafilelist, config, logger + metadata_schema, variables_values, data_file_list, config, logger ) # create dataset in scicat From f0148b348fd0628ce8c4827d47127bf7c092e377 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Tue, 6 Aug 2024 15:04:34 +0200 Subject: [PATCH 18/25] Remove duplicated helper function. 
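The offline ingestor still carried ``_new_hash`` and ``_compute_file_checksum`` even though the same logic already lives in ``scicat_dataset._calculate_checksum``, so the local copies are dropped. Both versions follow the same chunked-read pattern, roughly like this condensed sketch (not the exact module code):

```python
import hashlib
import pathlib


def file_checksum(path: pathlib.Path, algorithm: str) -> str:
    # Stream the file in 128 KiB chunks so large NeXus files are never
    # read into memory in one go.
    chk = hashlib.new(algorithm)
    buffer = memoryview(bytearray(128 * 1024))
    with path.open("rb", buffering=0) as f:
        for n in iter(lambda: f.readinto(buffer), 0):
            chk.update(buffer[:n])
    return chk.hexdigest()
```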
--- src/scicat_offline_ingestor.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index 0bed98f..0d7935e 100644 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -3,13 +3,11 @@ # import scippnexus as snx import copy import datetime -import hashlib import json import logging import os import pathlib import uuid -from typing import Any from urllib.parse import urljoin import h5py @@ -80,29 +78,6 @@ def extract_variables_values( return values -def _new_hash(algorithm: str) -> Any: - try: - return hashlib.new(algorithm, usedforsecurity=False) - except TypeError: - # Fallback for Python < 3.9 - return hashlib.new(algorithm) - - -def _compute_file_checksum(file_full_path: pathlib.Path, algorithm: str) -> str: - """ - Compute the checksum of a file using specified algorithm. - :param file_full_path: - :param algorithm: - :return: - """ - chk = _new_hash(algorithm) - buffer = memoryview(bytearray(128 * 1024)) - with file_full_path.open("rb", buffering=0) as file: - for n in iter(lambda: file.readinto(buffer), 0): - chk.update(buffer[:n]) - return chk.hexdigest() # type: ignore[no-any-return] - - def _create_datafilelist_item(file_full_path: pathlib.Path, config, logger): """ Create the matching entry in the datafiles list for the file provided From 9ba834b6e8e92132cd99b28bec08fb5f0ee3a9ee Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Tue, 6 Aug 2024 15:13:36 +0200 Subject: [PATCH 19/25] Update data file list constructing helper signature. --- src/scicat_dataset.py | 31 ++++++++++++++++++++++++------- src/scicat_offline_ingestor.py | 26 +++++++------------------- 2 files changed, 31 insertions(+), 26 deletions(-) diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py index f2790b6..a9bbaba 100644 --- a/src/scicat_dataset.py +++ b/src/scicat_dataset.py @@ -183,7 +183,10 @@ def _save_hash_file( def create_data_file_list( - file_list: list[pathlib.Path], + *, + nexus_file: pathlib.Path, + done_writing_message_file: pathlib.Path | None = None, + nexus_structure_file: pathlib.Path | None = None, ingestor_directory: pathlib.Path, config: FileHandlingOptions, logger: logging.Logger, @@ -193,11 +196,18 @@ def create_data_file_list( Params ------ - file_list: - Paths to the files that will be ingested - - nexus_file(mandatory) - - done_writing_message_file(optional) - - nexus_structure_file(optional) + nexus_file: + Path to the NeXus file. + done_writing_message_file: + Path to the "done writing" message file. + nexus_structure_file: + Path to the NeXus structure file. + ingestor_directory: + Path to the directory where the files will be saved. + config: + Configuration related to the file handling. + logger: + Logger instance. 
""" from functools import partial @@ -208,7 +218,14 @@ def create_data_file_list( compute_file_stats=config.compute_file_stats, ) - # Collect default data-file items + # Collect the files that will be ingested + file_list = [nexus_file] + if done_writing_message_file is not None: + file_list.append(done_writing_message_file) + if nexus_structure_file is not None: + file_list.append(nexus_structure_file) + + # Create the list of the files data_file_list = [] for minimum_file_path in file_list: logger.info("Adding file %s to the datafiles list", minimum_file_path) diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index 0d7935e..08ba812 100644 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -326,7 +326,7 @@ def main() -> None: arg_namespace = arg_parser.parse_args() config = build_scicat_offline_ingestor_config(arg_namespace) ingestion_options = config.ingestion - file_handling_options = ingestion_options.file_handling + fh_options = ingestion_options.file_handling logger = build_logger(config) # Log the configuration as dictionary so that it is easier to read from the logs @@ -344,26 +344,10 @@ def main() -> None: "Nexus file to be ingested : %s", nexus_file_path, ) - data_file_paths = [nexus_file_path] - done_writing_message_file_path = pathlib.Path() - if config.ingestion.file_handling.message_to_file: - done_writing_message_file_path = pathlib.Path( - config.offline_run.done_writing_message_file - ) - logger.info( - "Done writing message file linked to nexus file : %s", - done_writing_message_file_path, - ) - - # log done writing message input file - logger.info(json.load(done_writing_message_file_path.open())) - data_file_paths.append(done_writing_message_file_path) # define which is the directory where the ingestor should save # the files it creates, if any is created - ingestor_directory = compose_ingestor_directory( - config.ingestion.file_handling, nexus_file_path - ) + ingestor_directory = compose_ingestor_directory(fh_options, nexus_file_path) # open nexus file with h5py with h5py.File(nexus_file_path) as h5file: @@ -377,7 +361,11 @@ def main() -> None: # Collect data-file descriptions data_file_list = create_data_file_list( - data_file_paths, ingestor_directory, file_handling_options, logger + nexus_file=nexus_file_path, + ingestor_directory=ingestor_directory, + config=fh_options, + logger=logger, + # TODO: add done_writing_message_file and nexus_structure_file ) dataset_source_folder = _define_dataset_source_folder(data_file_list) From 4ef4aac7d1d9d6cc2b295d811c2350d10745239d Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Tue, 6 Aug 2024 15:26:28 +0200 Subject: [PATCH 20/25] Remove duplicating helper. 
--- src/scicat_offline_ingestor.py | 36 ---------------------------------- 1 file changed, 36 deletions(-) diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index 08ba812..3a065ea 100644 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -2,7 +2,6 @@ # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject) # import scippnexus as snx import copy -import datetime import json import logging import os @@ -78,41 +77,6 @@ def extract_variables_values( return values -def _create_datafilelist_item(file_full_path: pathlib.Path, config, logger): - """ - Create the matching entry in the datafiles list for the file provided - :param file_full_path: - :param config: - :param logger: - :return: - """ - datafilelist_item = { - "path": file_full_path, - "size": 0, - "time": datetime.datetime.now(tz=datetime.UTC).strftime( - "%Y-%m-%dT%H:%M:%S.000Z" - ), - } - - if config.ingestion.compute_files_stats and file_full_path.exists(): - logger.info("create_datafilelist_item: reading file stats from disk") - stats = file_full_path.stat() - datafilelist_item = { - **datafilelist_item, - **{ - "size": stats.st_size, - "time": datetime.datetime.fromtimestamp( - stats.st_ctime, tz=datetime.UTC - ).strftime("%Y-%m-%dT%H:%M:%S.000Z"), - "uid": stats.st_uid, - "gid": stats.st_gid, - "perm": stats.st_mode, - }, - } - - return datafilelist_item - - def _prepare_scicat_dataset( metadata_schema: dict, values: dict, datafilelist: list[dict], config, logger ): From 24c027234c47422ef4786747e37e20228c8824de Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Tue, 6 Aug 2024 16:58:22 +0200 Subject: [PATCH 21/25] Refactor scicat dataset instance creation. --- src/scicat_dataset.py | 201 +++++++++++++++++++++++++++++++-- src/scicat_metadata.py | 10 ++ src/scicat_offline_ingestor.py | 111 +++--------------- 3 files changed, 218 insertions(+), 104 deletions(-) diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py index a9bbaba..b77f50c 100644 --- a/src/scicat_dataset.py +++ b/src/scicat_dataset.py @@ -3,11 +3,19 @@ import datetime import logging import pathlib -from dataclasses import dataclass +import uuid +from collections.abc import Iterable +from dataclasses import asdict, dataclass from types import MappingProxyType from typing import Any -from scicat_configuration import FileHandlingOptions +from scicat_configuration import DatasetOptions, FileHandlingOptions +from scicat_metadata import ( + HIGH_LEVEL_METADATA_TYPE, + SCIENTIFIC_METADATA_TYPE, + VALID_METADATA_TYPES, + render_variable_value, +) def to_string(value: Any) -> str: @@ -41,6 +49,7 @@ def to_date(value: Any) -> str | None: "integer": to_integer, "float": to_float, "date": to_date, + # TODO: Add email converter } ) @@ -64,7 +73,10 @@ class TechniqueDesc: @dataclass(kw_only=True) class ScicatDataset: - pid: str + pid: str | None + size: int + numberOfFiles: int + isPublished: bool = False datasetName: str description: str principalInvestigator: str @@ -76,12 +88,12 @@ class ScicatDataset: contactEmail: str creationTime: str type: str = "raw" - techniques: list[TechniqueDesc] | None = None - instrumentId: str sampleId: str - proposalId: str - ownerGroup: str - accessGroup: list[str] + techniques: list[TechniqueDesc] | None = None + instrumentId: str | None = None + proposalId: str | None = None + ownerGroup: str | None = None + accessGroup: list[str] | None = None @dataclass(kw_only=True) @@ -255,10 +267,181 @@ def create_data_file_list( ) else: logger.warning( - "File(%s) instance does 
not have checksum. " + "File instance of (%s) does not have checksum. " "Probably the file does not exist. " "Skip saving...", minimum_file_path, ) return data_file_list + + +def _filter_by_field_type(schemas: Iterable[dict], field_type: str) -> list[dict]: + return [field for field in schemas if field["field_type"] == field_type] + + +def _render_variable_as_type(value: str, variable_map: dict, dtype: str) -> Any: + return convert_to_type(render_variable_value(value, variable_map), dtype) + + +def _create_scientific_metadata( + *, + metadata_schema_id: str, + sm_schemas: list[dict], + variable_map: dict, +) -> dict: + """Create scientific metadata from the metadata schema configuration. + + Params + ------ + metadata_schema_id: + The ID of the metadata schema configuration. + sm_schemas: + The scientific metadata schema configuration. + variable_map: + The variable map to render the scientific metadata values. + + """ + return { + # Default field + "ingestor_metadata_schema_id": { + "value": metadata_schema_id, + "unit": "", + "human_name": "Ingestor metadata schema ID", + "type": "string", + }, + **{ + field["machine_name"]: { + "value": _render_variable_as_type( + field["value"], variable_map, field["type"] + ), + "unit": field.get("unit", ""), + "human_name": field.get("human_name", field["machine_name"]), + "type": field["type"], + } + for field in sm_schemas + }, + } + + +def _validate_metadata_schemas( + metadata_schemas: dict[str, dict], +) -> None: + if any( + invalid_types := [ + field["field_type"] + for field in metadata_schemas.values() + if field["field_type"] not in VALID_METADATA_TYPES + ] + ): + raise ValueError( + "Invalid metadata schema types found. Valid types are: ", + VALID_METADATA_TYPES, + "Got: ", + invalid_types, + ) + + +def create_scicat_dataset_instance( + *, + metadata_schema_id: str, # metadata-schema["id"] + metadata_schemas: dict[str, dict], # metadata-schema["schema"] + variable_map: dict, + data_file_list: list[DataFileListItem], + config: DatasetOptions, + logger: logging.Logger, +) -> ScicatDataset: + """ + Prepare the ``ScicatDataset`` instance. + + Params + ------ + metadata_schema: + Metadata schema. + variables_values: + Variables values. + data_file_list: + List of the data files. + config: + Configuration related to scicat dataset instance. + logger: + Logger instance. 
+ + """ + _validate_metadata_schemas(metadata_schemas) + # Create the dataset instance + scicat_dataset = ScicatDataset( + size=sum([file.size for file in data_file_list if file.size is not None]), + numberOfFiles=len(data_file_list), + isPublished=False, + scientificMetadata=_create_scientific_metadata( + metadata_schema_id=metadata_schema_id, + sm_schemas=_filter_by_field_type( + metadata_schemas.values(), SCIENTIFIC_METADATA_TYPE + ), # Scientific metadata schemas + variable_map=variable_map, + ), + **{ + field["machine_name"]: _render_variable_as_type( + field["value"], variable_map, field["type"] + ) + for field in _filter_by_field_type( + metadata_schemas.values(), HIGH_LEVEL_METADATA_TYPE + ) + # High level schemas + }, + ) + + # Auto generate or assign default values if needed + if not config.allow_dataset_pid: + logger.info("PID is not allowed in the dataset by configuration.") + scicat_dataset.pid = None + elif config.generate_dataset_pid: + logger.info("Auto generating PID for the dataset based on the configuration.") + scicat_dataset.pid = uuid.uuid4().hex + if scicat_dataset.instrumentId is None: + scicat_dataset.instrumentId = config.default_instrument_id + logger.info( + "Instrument ID is not provided. Setting to default value. %s", + scicat_dataset.instrumentId, + ) + if scicat_dataset.proposalId is None: + scicat_dataset.proposalId = config.default_proposal_id + logger.info( + "Proposal ID is not provided. Setting to default value. %s", + scicat_dataset.proposalId, + ) + if scicat_dataset.ownerGroup is None: + scicat_dataset.ownerGroup = config.default_owner_group + logger.info( + "Owner group is not provided. Setting to default value. %s", + scicat_dataset.ownerGroup, + ) + if scicat_dataset.accessGroup is None: + scicat_dataset.accessGroup = config.default_access_groups + logger.info( + "Access group is not provided. Setting to default value. %s", + scicat_dataset.accessGroup, + ) + if scicat_dataset.techniques is None: + logger.info("Techniques are not provided. Setting to empty list.") + scicat_dataset.techniques = [] + + logger.info("Dataset instance is created successfully. %s", scicat_dataset) + return scicat_dataset + + +def scicat_dataset_to_dict(dataset: ScicatDataset) -> dict: + """ + Convert the ``dataset`` to a dictionary. + + It removes the ``None`` values from the dictionary. + You can add more handlings for specific fields here if needed. + + Params + ------ + dataset: + Scicat dataset instance. 
+ + """ + return {k: v for k, v in asdict(dataset).items() if v is not None} diff --git a/src/scicat_metadata.py b/src/scicat_metadata.py index 3367588..fca5d87 100644 --- a/src/scicat_metadata.py +++ b/src/scicat_metadata.py @@ -5,6 +5,10 @@ from collections.abc import Callable from importlib.metadata import entry_points +SCIENTIFIC_METADATA_TYPE = "scientific_metadata" +HIGH_LEVEL_METADATA_TYPE = "high_level" +VALID_METADATA_TYPES = (SCIENTIFIC_METADATA_TYPE, HIGH_LEVEL_METADATA_TYPE) + def load_metadata_extractors(extractor_name: str) -> Callable: """Load metadata extractors from the entry points.""" @@ -77,3 +81,9 @@ def select_applicable_schema(nexus_file, nxs, schemas): return schema raise Exception("No applicable metadata schema configuration found!!") + + +def render_variable_value(var_value: str, variable_registry: dict) -> str: + for var_name, var_value in variable_registry.items(): + var_value = var_value.replace("<" + var_name + ">", str(var_value)) + return var_value diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index 3a065ea..cf64b75 100644 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -6,7 +6,6 @@ import logging import os import pathlib -import uuid from urllib.parse import urljoin import h5py @@ -19,6 +18,8 @@ from scicat_dataset import ( convert_to_type, create_data_file_list, + create_scicat_dataset_instance, + scicat_dataset_to_dict, ) from scicat_logging import build_logger from scicat_metadata import collect_schemas, select_applicable_schema @@ -77,91 +78,6 @@ def extract_variables_values( return values -def _prepare_scicat_dataset( - metadata_schema: dict, values: dict, datafilelist: list[dict], config, logger -): - """ - Prepare scicat dataset as dictionary ready to be ``POST``ed. 
- """ - logger.info("_prepare_scicat_dataset: Preparing scicat dataset structure") - schema: dict = metadata_schema["schema"] - dataset: dict = {} - - scientific_metadata = { - 'ingestor_metadata_schema_id': { - "value": metadata_schema["id"], - "unit": "", - "human_name": "Ingestor Metadata Schema Id", - "type": "string", - } - } - for field in schema.values(): - machine_name = field["machine_name"] - field_type = field["type"] - if field["field_type"] == "high_level": - dataset[machine_name] = convert_to_type( - replace_variables_values(field["value"], values), field_type - ) - elif field["field_type"] == "scientific_metadata": - scientific_metadata[machine_name] = { - "value": convert_to_type( - replace_variables_values(field["value"], values), field_type - ), - "unit": "", - "human_name": field["human_name"] - if field.get("human_name", None) - else machine_name, - "type": field_type, - } - else: - raise Exception("Metadata schema field type invalid") - - dataset["scientific_metadata"] = scientific_metadata - - # now check that the configuration setting shave been respected - if not config.dataset.allow_dataset_pid and "pid" in dataset.keys(): - logger.info("_prepare_scicat_dataset: Pid not allowed by configuration") - del dataset["pid"] - if config.dataset.generate_dataset_pid: - logger.info("_prepare_scicat_dataset: Auto generating pid by configuration") - dataset["pid"] = str(uuid.uuid4()) - - if "instrumentId" not in dataset.keys() or not dataset["instrumentId"]: - logger.info( - "_prepare_scicat_dataset: Assigning default instrument id: %s", - config.dataset.default_instrument_id, - ) - dataset["instrumentId"] = config.dataset.default_instrument_id - - if "proposalId" not in dataset.keys() or not dataset["proposalId"]: - logger.info( - "_prepare_scicat_dataset: Assigning default proposal id: %s", - config.dataset.default_proposal_id, - ) - dataset["proposalId"] = config.dataset.default_proposal_id - - if "ownerGroup" not in dataset.keys() or not dataset["ownerGroup"]: - logger.info( - "_prepare_scicat_dataset: Assigning default ownerGroup: %s", - config.dataset.default_owner_group, - ) - dataset["ownerGroup"] = config.dataset.default_owner_group - - if "accessGroups" not in dataset.keys() or not dataset["accessGroups"]: - logger.info( - "_prepare_scicat_dataset: Assigning default accessGroups: %s", - json.dumps(config.dataset.default_access_groups), - ) - dataset["accessGroups"] = config.dataset.default_access_groups - - dataset["size"] = len(datafilelist) - dataset["numberOfFiles"] = sum([item["size"] for item in datafilelist]) - dataset["isPublished"] = False - - logger.info("_prepare_scicat_dataset: Scicat dataset: %s", json.dumps(dataset)) - return dataset - - def _create_scicat_dataset(dataset: dict, config, logger: logging.Logger) -> dict: """ Execute a POST request to scicat to create a dataset @@ -332,20 +248,25 @@ def main() -> None: # TODO: add done_writing_message_file and nexus_structure_file ) + # Create scicat dataset instance(entry) + local_dataset = scicat_dataset_to_dict( + create_scicat_dataset_instance( + metadata_schema_id=metadata_schema["id"], + metadata_schemas=metadata_schema["schemas"], + variable_map=variables_values, + data_file_list=data_file_list, + config=config.dataset, + logger=logger, + ) + ) + # create dataset in scicat + scicat_dataset = _create_scicat_dataset(local_dataset, config, logger) + dataset_source_folder = _define_dataset_source_folder(data_file_list) origdatablock_datafiles_list = _prepare_origdatablock_datafilelist( data_file_list, 
dataset_source_folder ) - - # create and populate scicat dataset entry - local_dataset = _prepare_scicat_dataset( - metadata_schema, variables_values, data_file_list, config, logger - ) - - # create dataset in scicat - scicat_dataset = _create_scicat_dataset(local_dataset, config, logger) - # create and populate scicat origdatablock entry # with files and hashes previously computed local_origdatablock = _prepare_scicat_origdatablock( From 3f3359e79ffeeb2e19e50415cc8f51f7a61263ab Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Tue, 6 Aug 2024 17:35:51 +0200 Subject: [PATCH 22/25] Refactor variable extraction logic. --- src/scicat_communication.py | 17 ++++++++++ src/scicat_dataset.py | 45 +++++++++++++++++++++++-- src/scicat_metadata.py | 4 +++ src/scicat_offline_ingestor.py | 60 +++------------------------------- 4 files changed, 68 insertions(+), 58 deletions(-) create mode 100644 src/scicat_communication.py diff --git a/src/scicat_communication.py b/src/scicat_communication.py new file mode 100644 index 0000000..ab75ab2 --- /dev/null +++ b/src/scicat_communication.py @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject) +import requests +from scicat_configuration import SciCatOptions + + +def retrieve_value_from_scicat( + *, + config: SciCatOptions, + variable_url: str, # It should be already rendered from variable_recipe["url"] + field_name: str, # variable_recipe["field"] +) -> str: + url = config.host.removesuffix('/') + variable_url + response: dict = requests.get( + url, headers={"token": config.token}, timeout=config.timeout + ).json() + return response[field_name] diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py index b77f50c..fab2579 100644 --- a/src/scicat_dataset.py +++ b/src/scicat_dataset.py @@ -4,12 +4,14 @@ import logging import pathlib import uuid -from collections.abc import Iterable +from collections.abc import Callable, Iterable from dataclasses import asdict, dataclass from types import MappingProxyType from typing import Any -from scicat_configuration import DatasetOptions, FileHandlingOptions +import h5py +from scicat_communication import retrieve_value_from_scicat +from scicat_configuration import DatasetOptions, FileHandlingOptions, SciCatOptions from scicat_metadata import ( HIGH_LEVEL_METADATA_TYPE, SCIENTIFIC_METADATA_TYPE, @@ -63,6 +65,45 @@ def convert_to_type(input_value: Any, dtype_desc: str) -> Any: return converter(input_value) +_OPERATOR_REGISTRY = MappingProxyType( + { + "DO_NOTHING": lambda value: value, + "join_with_space": lambda value: ", ".join(value), + } +) + + +def _get_operator(operator: str | None) -> Callable: + return _OPERATOR_REGISTRY.get(operator or "DO_NOTHING", lambda _: _) + + +def extract_variables_values( + variables: dict[str, dict], h5file: h5py.File, config: SciCatOptions +) -> dict: + variable_map = {} + for variable_name, variable_recipe in variables.items(): + if (source := variable_recipe["source"]) == "NXS": + value = h5file[variable_recipe["path"]][...] 
+ elif source == "SC": + value = retrieve_value_from_scicat( + config=config, + variable_url=render_variable_value( + variable_recipe["url"], variable_map + ), + field_name=variable_recipe["field"], + ) + elif source == "VALUE": + value = _get_operator(variable_recipe.get("operator"))( + render_variable_value(variable_recipe["value"], variable_map) + ) + else: + raise Exception("Invalid variable source: ", source) + variable_map[variable_name] = convert_to_type( + value, variable_recipe["value_type"] + ) + return variable_map + + @dataclass(kw_only=True) class TechniqueDesc: pid: str diff --git a/src/scicat_metadata.py b/src/scicat_metadata.py index fca5d87..572d50a 100644 --- a/src/scicat_metadata.py +++ b/src/scicat_metadata.py @@ -86,4 +86,8 @@ def select_applicable_schema(nexus_file, nxs, schemas): def render_variable_value(var_value: str, variable_registry: dict) -> str: for var_name, var_value in variable_registry.items(): var_value = var_value.replace("<" + var_name + ">", str(var_value)) + + if "<" in var_value and ">" in var_value: + raise Exception(f"Unresolved variable: {var_value}") + return var_value diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index cf64b75..67e9a3a 100644 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -11,14 +11,13 @@ import h5py import requests from scicat_configuration import ( - OfflineIngestorConfig, build_offline_ingestor_arg_parser, build_scicat_offline_ingestor_config, ) from scicat_dataset import ( - convert_to_type, create_data_file_list, create_scicat_dataset_instance, + extract_variables_values, scicat_dataset_to_dict, ) from scicat_logging import build_logger @@ -27,57 +26,6 @@ from system_helpers import exit, offline_ingestor_exit_at_exceptions -def replace_variables_values(url: str, values: dict) -> str: - for key, value in values.items(): - url = url.replace("{" + key + "}", str(value)) - return url - - -def extract_variables_values( - variables: dict, h5file, config: OfflineIngestorConfig -) -> dict: - values = {} - - # loop on all the variables defined - for variable in variables.keys(): - source = variables[variable]["source"] - value = "" - if source == "NXS": - # extract value from nexus file - # we need to address path entry/user_*/name - value = h5file[variables[variable]["path"]][...] - elif source == "SC": - # build url - url = replace_variables_values( - config[""]["scicat_url"] + variables[variable]["url"], values - ) - # retrieve value from SciCat - response = requests.get( - url, - headers={"token": config[""]["token"]}, - timeout=10, # TODO: decide timeout. Maybe from configuration? 
- ) - # extract value - value = response.json()[variables[variable]["field"]] - elif source == "VALUE": - # the value is the one indicated - # there might be some substitution needed - value = replace_variables_values(variables[variable]["value"], values) - if ( - "operator" in variables[variable].keys() - and variables[variable]["operator"] - ): - operator = variables[variable]["operator"] - if operator == "join_with_space": - value = ", ".join(value) - else: - raise Exception("Invalid variable source configuration") - - values[variable] = convert_to_type(value, variables[variable]["value_type"]) - - return values - - def _create_scicat_dataset(dataset: dict, config, logger: logging.Logger) -> dict: """ Execute a POST request to scicat to create a dataset @@ -235,8 +183,8 @@ def main() -> None: metadata_schema = select_applicable_schema(nexus_file_path, h5file, schemas) # define variables values - variables_values = extract_variables_values( - metadata_schema['variables'], h5file, config + variable_map = extract_variables_values( + metadata_schema['variables'], h5file, config.scicat ) # Collect data-file descriptions @@ -253,7 +201,7 @@ def main() -> None: create_scicat_dataset_instance( metadata_schema_id=metadata_schema["id"], metadata_schemas=metadata_schema["schemas"], - variable_map=variables_values, + variable_map=variable_map, data_file_list=data_file_list, config=config.dataset, logger=logger, From f54789742722b009edbead2c7ccdbf28576e4615 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Tue, 6 Aug 2024 18:07:36 +0200 Subject: [PATCH 23/25] Add communication helper. --- src/scicat_communication.py | 39 ++++++++++++++++++++++++++++++++++ src/scicat_offline_ingestor.py | 35 ++++-------------------------- 2 files changed, 43 insertions(+), 31 deletions(-) diff --git a/src/scicat_communication.py b/src/scicat_communication.py index ab75ab2..4d0b1ee 100644 --- a/src/scicat_communication.py +++ b/src/scicat_communication.py @@ -1,5 +1,8 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject) +import logging +from urllib.parse import urljoin + import requests from scicat_configuration import SciCatOptions @@ -15,3 +18,39 @@ def retrieve_value_from_scicat( url, headers={"token": config.token}, timeout=config.timeout ).json() return response[field_name] + + +class ScicatDatasetAPIError(Exception): + pass + + +def create_scicat_dataset( + *, dataset: dict, config: SciCatOptions, logger: logging.Logger +) -> dict: + """ + Execute a POST request to scicat to create a dataset + """ + logger.info("_create_scicat_dataset: Sending POST request to create new dataset") + response = requests.request( + method="POST", + url=urljoin(config.host, "datasets"), + json=dataset, + headers={"token": config.token, **config.headers}, + timeout=config.timeout, + stream=False, + verify=True, + ) + + result: dict = response.json() + if not response.ok: + logger.error( + "Failed to create new dataset. \nError message from scicat backend: \n%s", + result.get("error", {}), + ) + raise ScicatDatasetAPIError(f"Error creating new dataset: \n{dataset}") + + logger.info( + "Dataset created successfully. 
Dataset pid: %s", + result.get("pid"), + ) + return result diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index 67e9a3a..159baf6 100644 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -10,6 +10,7 @@ import h5py import requests +from scicat_communication import create_scicat_dataset from scicat_configuration import ( build_offline_ingestor_arg_parser, build_scicat_offline_ingestor_config, @@ -26,36 +27,6 @@ from system_helpers import exit, offline_ingestor_exit_at_exceptions -def _create_scicat_dataset(dataset: dict, config, logger: logging.Logger) -> dict: - """ - Execute a POST request to scicat to create a dataset - """ - logger.info("_create_scicat_dataset: Sending POST request to create new dataset") - response = requests.request( - method="POST", - url=urljoin(config.scicat.host, "datasets"), - json=dataset, - headers=config.scicat.headers, - timeout=config.scicat.timeout, - stream=False, - verify=True, - ) - - result = response.json() - if not response.ok: - err = result.get("error", {}) - logger.error( - "_create_scicat_dataset: Failed to create new dataset. Error %s", err - ) - raise Exception(f"Error creating new dataset: {err}") - - logger.info( - "_create_scicat_dataset: Dataset created successfully. Dataset pid: %s", - result['pid'], - ) - return result - - def _prepare_scicat_origdatablock(scicat_dataset, datafilelist, config, logger): """ Create local copy of the orig datablock to send to scicat @@ -208,7 +179,9 @@ def main() -> None: ) ) # create dataset in scicat - scicat_dataset = _create_scicat_dataset(local_dataset, config, logger) + scicat_dataset = create_scicat_dataset( + dataset=local_dataset, config=config.scicat, logger=logger + ) dataset_source_folder = _define_dataset_source_folder(data_file_list) From 6a13d5bf1d280f526ba4173f7882afc82ea196fd Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Tue, 6 Aug 2024 18:13:30 +0200 Subject: [PATCH 24/25] Extract duplicating function bodies. 
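
Both create_scicat_dataset and create_scicat_origdatablock were issuing an identical POST request; the diff below moves the HTTP mechanics into a shared _post_to_scicat helper so each endpoint-specific function only builds its URL, passes its payload and maps failures to its own exception type. A minimal sketch of the pattern, assuming the SciCatOptions fields (host, token, headers, timeout) used elsewhere in this series; the generic _create_entry wrapper here is illustrative only, the real code keeps one function per endpoint:

    import logging
    from urllib.parse import urljoin

    import requests

    def _post_to_scicat(*, url: str, posting_obj: dict, headers: dict, timeout: int):
        # Single place that owns the HTTP call shared by every "create" helper.
        return requests.request(
            method="POST",
            url=url,
            json=posting_obj,
            headers=headers,
            timeout=timeout,
            stream=False,
            verify=True,
        )

    def _create_entry(
        *, endpoint: str, payload: dict, config: "SciCatOptions", logger: logging.Logger
    ) -> dict:
        # Illustrative wrapper: build the URL, post, and fail loudly on errors.
        response = _post_to_scicat(
            url=urljoin(config.host, endpoint),
            posting_obj=payload,
            headers={"token": config.token, **config.headers},
            timeout=config.timeout,
        )
        result: dict = response.json()
        if not response.ok:
            logger.error("Failed to create %s entry: %s", endpoint, result.get("error", {}))
            raise RuntimeError(f"Error creating new {endpoint} entry")
        return result
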
--- src/scicat_communication.py | 57 +++++++++++++++++++++++++++++----- src/scicat_offline_ingestor.py | 46 ++------------------------- 2 files changed, 53 insertions(+), 50 deletions(-) diff --git a/src/scicat_communication.py b/src/scicat_communication.py index 4d0b1ee..d9216c0 100644 --- a/src/scicat_communication.py +++ b/src/scicat_communication.py @@ -24,23 +24,31 @@ class ScicatDatasetAPIError(Exception): pass +def _post_to_scicat(*, url: str, posting_obj: dict, headers: dict, timeout: int): + return requests.request( + method="POST", + url=url, + json=posting_obj, + headers=headers, + timeout=timeout, + stream=False, + verify=True, + ) + + def create_scicat_dataset( *, dataset: dict, config: SciCatOptions, logger: logging.Logger ) -> dict: """ Execute a POST request to scicat to create a dataset """ - logger.info("_create_scicat_dataset: Sending POST request to create new dataset") - response = requests.request( - method="POST", + logger.info("Sending POST request to create new dataset") + response = _post_to_scicat( url=urljoin(config.host, "datasets"), - json=dataset, + posting_obj=dataset, headers={"token": config.token, **config.headers}, timeout=config.timeout, - stream=False, - verify=True, ) - result: dict = response.json() if not response.ok: logger.error( @@ -54,3 +62,38 @@ def create_scicat_dataset( result.get("pid"), ) return result + + +class ScicatOrigDatablockAPIError(Exception): + pass + + +def create_scicat_origdatablock( + *, origdatablock: dict, config: SciCatOptions, logger: logging.Logger +) -> dict: + """ + Execute a POST request to scicat to create a new origdatablock + """ + logger.info("Sending POST request to create new origdatablock") + response = _post_to_scicat( + url=urljoin(config.host, "origdatablocks"), + posting_obj=origdatablock, + headers={"token": config.token, **config.headers}, + timeout=config.timeout, + ) + result: dict = response.json() + if not response.ok: + logger.error( + "Failed to create new origdatablock. " + "Error message from scicat backend: \n%s", + result.get("error", {}), + ) + raise ScicatOrigDatablockAPIError( + f"Error creating new origdatablock: \n{origdatablock}" + ) + + logger.info( + "Origdatablock created successfully. 
Origdatablock pid: %s", + result['_id'], + ) + return result diff --git a/src/scicat_offline_ingestor.py b/src/scicat_offline_ingestor.py index 159baf6..87e6b29 100644 --- a/src/scicat_offline_ingestor.py +++ b/src/scicat_offline_ingestor.py @@ -3,14 +3,11 @@ # import scippnexus as snx import copy import json -import logging import os import pathlib -from urllib.parse import urljoin import h5py -import requests -from scicat_communication import create_scicat_dataset +from scicat_communication import create_scicat_dataset, create_scicat_origdatablock from scicat_configuration import ( build_offline_ingestor_arg_parser, build_scicat_offline_ingestor_config, @@ -50,43 +47,6 @@ def _prepare_scicat_origdatablock(scicat_dataset, datafilelist, config, logger): return origdatablock -def _create_scicat_origdatablock( - origdatablock: dict, config, logger: logging.Logger -) -> dict: - """ - Execute a POST request to scicat to create a new origdatablock - """ - logger.info( - "_create_scicat_origdatablock: Sending POST request to create new origdatablock" - ) - response = requests.request( - method="POST", - url=urljoin(config.scicat.host, "origdatablocks"), - json=origdatablock, - headers=config.scicat.headers, - timeout=config.scicat.timeout, - stream=False, - verify=True, - ) - - result = response.json() - if not response.ok: - err = result.get("error", {}) - logger.error( - "_create_scicat_origdatablock: Failed to create new origdatablock." - "Error %s", - err, - ) - raise Exception(f"Error creating new origdatablock: {err}") - - logger.info( - "_create_scicat_origdatablock: Origdatablock created successfully. " - "Origdatablock pid: %s", - result['_id'], - ) - return result - - def _define_dataset_source_folder(datafilelist) -> pathlib.Path: """ Return the dataset source folder, which is the common path @@ -195,8 +155,8 @@ def main() -> None: ) # create origdatablock in scicat - scicat_origdatablock = _create_scicat_origdatablock( - local_origdatablock, config, logger + scicat_origdatablock = create_scicat_origdatablock( + origdatablock=local_origdatablock, config=config.scicat, logger=logger ) # check one more time if we successfully created the entries in scicat From bc30a8a49e81f7eff28c7cff45b954fa9a835b30 Mon Sep 17 00:00:00 2001 From: YooSunyoung Date: Wed, 7 Aug 2024 17:11:29 +0200 Subject: [PATCH 25/25] Use default factory for techniques field. --- src/scicat_dataset.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/scicat_dataset.py b/src/scicat_dataset.py index fab2579..ce15d27 100644 --- a/src/scicat_dataset.py +++ b/src/scicat_dataset.py @@ -5,7 +5,7 @@ import pathlib import uuid from collections.abc import Callable, Iterable -from dataclasses import asdict, dataclass +from dataclasses import asdict, dataclass, field from types import MappingProxyType from typing import Any @@ -108,7 +108,7 @@ def extract_variables_values( class TechniqueDesc: pid: str "Technique PID" - names: str + name: str "Technique Name" @@ -130,7 +130,7 @@ class ScicatDataset: creationTime: str type: str = "raw" sampleId: str - techniques: list[TechniqueDesc] | None = None + techniques: list[TechniqueDesc] = field(default_factory=list) instrumentId: str | None = None proposalId: str | None = None ownerGroup: str | None = None @@ -464,9 +464,6 @@ def create_scicat_dataset_instance( "Access group is not provided. Setting to default value. %s", scicat_dataset.accessGroup, ) - if scicat_dataset.techniques is None: - logger.info("Techniques are not provided. 
Setting to empty list.") - scicat_dataset.techniques = [] logger.info("Dataset instance is created successfully. %s", scicat_dataset) return scicat_dataset
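
Note on the default-factory change above: dataclasses evaluate field defaults once at class-definition time, so a bare mutable default such as an empty list is rejected outright, and the previous "techniques: list[TechniqueDesc] | None = None" forced the None-check that this patch deletes. Using field(default_factory=list) gives every dataset instance its own empty list instead. A small self-contained illustration (the Example class is made up for this note, not part of the codebase):

    from dataclasses import dataclass, field

    @dataclass(kw_only=True)
    class Example:
        # default_factory runs once per instance; a literal [] default would raise
        # "mutable default <class 'list'> ... is not allowed" at class definition.
        techniques: list[str] = field(default_factory=list)

    a = Example()
    b = Example()
    a.techniques.append("SANS")
    assert b.techniques == []  # no shared state between instances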