Merge pull request #75 from SciCatProject/check-dataset-by-job-id-2
Check dataset by job id and metadata
YooSunYoung authored Oct 28, 2024
2 parents b941d83 + cce2dae commit abf6876
Showing 4 changed files with 207 additions and 37 deletions.
4 changes: 3 additions & 1 deletion resources/config.sample.json
@@ -2,7 +2,6 @@
"config_file": "",
"id": "",
"dataset": {
"check_by_job_id": true,
"allow_dataset_pid": true,
"generate_dataset_pid": false,
"dataset_pid_prefix": "20.500.12269",
@@ -17,6 +16,9 @@
"dry_run": false,
"offline_ingestor_executable": "background_ingestor",
"schemas_directory": "schemas",
"check_if_dataset_exists_by_pid": true,
"check_if_dataset_exists_by_metadata": true,
"check_if_dataset_exists_by_metadata_key": "job_id",
"file_handling": {
"compute_file_stats": true,
"compute_file_hash": true,
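The three new check_if_dataset_exists_by_* keys replace the old dataset.check_by_job_id flag: existence can now be checked by PID, by a metadata key, or by both. A minimal sketch of reading them, assuming the keys sit under an "ingestion" section of the config (as the IngestionOptions dataclass below suggests) and using a hypothetical file path:

import json

# Hypothetical path; point this at the deployed ingestor configuration.
with open("resources/config.sample.json") as f:
    ingestion_cfg = json.load(f)["ingestion"]

check_by_pid = ingestion_cfg["check_if_dataset_exists_by_pid"]            # True
check_by_metadata = ingestion_cfg["check_if_dataset_exists_by_metadata"]  # True
metadata_key = ingestion_cfg["check_if_dataset_exists_by_metadata_key"]   # "job_id"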
79 changes: 78 additions & 1 deletion src/scicat_communication.py
@@ -1,7 +1,9 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
import json
import logging
from urllib.parse import urljoin
from typing import Any
from urllib.parse import quote, urljoin

import requests
from scicat_configuration import SciCatOptions
@@ -24,6 +26,12 @@ class ScicatDatasetAPIError(Exception):
pass


def _get_from_scicat(
    *, url: str, headers: dict, timeout: int, stream: bool, verify: bool
) -> requests.Response:
    # Forward stream and verify so the SciCatOptions settings take effect.
    return requests.get(
        url, headers=headers, timeout=timeout, stream=stream, verify=verify
    )


def _post_to_scicat(*, url: str, posting_obj: dict, headers: dict, timeout: int):
return requests.request(
method="POST",
@@ -97,3 +105,72 @@ def create_scicat_origdatablock(
result['_id'],
)
return result


def check_dataset_by_pid(
pid: str, config: SciCatOptions, logger: logging.Logger
) -> bool:
response = _get_from_scicat(
url=urljoin(config.host, f"datasets/{quote(pid)}"),
headers=config.headers,
timeout=config.timeout,
stream=config.stream,
verify=config.verify,
)
dataset_exists: bool
if not response.ok:
logger.error(
"Failed to check dataset existence by pid with status code: %s. "
"Error message from scicat backend: \n%s\n"
"Assuming the dataset does not exist.",
response.status_code,
response.reason,
)
dataset_exists = False
elif response.json():
logger.info("Dataset with pid %s exists.", pid)
dataset_exists = True
else:
logger.info("Dataset with pid %s does not exist.", pid)
dataset_exists = False

return dataset_exists


def check_dataset_by_metadata(
metadata_key: str,
metadata_value: Any,
config: SciCatOptions,
logger: logging.Logger,
) -> bool:
metadata_dict = {f"scientificMetadata.{metadata_key}.value": metadata_value}
filter_string = '?filter={"where":' + json.dumps(metadata_dict) + "}"
url = urljoin(config.host, "datasets") + filter_string
logger.info("Checking if dataset exists by metadata with url: %s", url)
response = _get_from_scicat(
url=url,
headers=config.headers,
timeout=config.timeout,
stream=config.stream,
verify=config.verify,
)
dataset_exists: bool
if not response.ok:
logger.error(
"Failed to check dataset existence by metadata key %s with status code: %s "
"Error message from scicat backend: \n%s\n"
"Assuming the dataset does not exist.",
metadata_key,
response.status_code,
response.reason,
)
dataset_exists = False
elif response.json():
logger.info("Retrieved %s dataset(s) from SciCat", len(response.json()))
logger.info("Dataset with metadata %s exists.", metadata_dict)
dataset_exists = True
else:
logger.info("Dataset with metadata %s does not exist.", metadata_dict)
dataset_exists = False

return dataset_exists
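For reference, the filter that check_dataset_by_metadata sends is a plain SciCat "where" clause on a scientific-metadata field. A small sketch of the URL it builds, with a hypothetical host and job id:

import json
from urllib.parse import urljoin

host = "http://localhost/api/v3/"  # hypothetical; in practice this comes from SciCatOptions.host
metadata_dict = {"scientificMetadata.job_id.value": "job-2024-001"}  # hypothetical job id
filter_string = '?filter={"where":' + json.dumps(metadata_dict) + "}"
url = urljoin(host, "datasets") + filter_string
# url == 'http://localhost/api/v3/datasets?filter={"where":{"scientificMetadata.job_id.value": "job-2024-001"}}'

Note that the filter value is not URL-encoded here; quote() is only applied to the pid path segment in check_dataset_by_pid.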
11 changes: 3 additions & 8 deletions src/scicat_configuration.py
@@ -210,6 +210,9 @@ class IngestionOptions:
dry_run: bool = False
offline_ingestor_executable: str = "background_ingestor"
schemas_directory: str = "schemas"
check_if_dataset_exists_by_pid: bool = True
check_if_dataset_exists_by_metadata: bool = True
check_if_dataset_exists_by_metadata_key: str = "job_id"
file_handling: FileHandlingOptions = field(default_factory=FileHandlingOptions)


@@ -219,7 +222,6 @@ def default_access_groups() -> list[str]:

@dataclass(kw_only=True)
class DatasetOptions:
check_by_job_id: bool = True
allow_dataset_pid: bool = True
generate_dataset_pid: bool = False
dataset_pid_prefix: str = "20.500.12269"
@@ -238,13 +240,6 @@ class SciCatOptions:
stream: bool = True
verify: bool = False

@classmethod
def from_configurations(cls, config: dict) -> "SciCatOptions":
"""Create SciCatOptions from a dictionary."""
options = cls(**config)
options.headers = {"Authorization": f"Bearer {options.token}"}
return options


@dataclass(kw_only=True)
class OnlineIngestorConfig:
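The new fields follow the module's keyword-only dataclass pattern. A self-contained sketch of the equivalent shape (a stand-in for illustration, not the real class, which carries further fields):

from dataclasses import dataclass

@dataclass(kw_only=True)
class IngestionOptionsSketch:
    # Mirrors the three fields added to IngestionOptions above.
    check_if_dataset_exists_by_pid: bool = True
    check_if_dataset_exists_by_metadata: bool = True
    check_if_dataset_exists_by_metadata_key: str = "job_id"

# E.g. a facility that keys dataset identity on a different metadata field:
opts = IngestionOptionsSketch(check_if_dataset_exists_by_metadata_key="run_id")
print(opts.check_if_dataset_exists_by_metadata_key)  # run_id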
150 changes: 123 additions & 27 deletions src/scicat_offline_ingestor.py
@@ -1,17 +1,26 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)

import logging
from pathlib import Path

import h5py
from scicat_communication import create_scicat_dataset, create_scicat_origdatablock
from scicat_communication import (
check_dataset_by_metadata,
check_dataset_by_pid,
create_scicat_dataset,
create_scicat_origdatablock,
)
from scicat_configuration import (
IngestionOptions,
OfflineIngestorConfig,
SciCatOptions,
build_arg_parser,
build_dataclass,
merge_config_and_input_args,
)
from scicat_dataset import (
ScicatDataset,
create_data_file_list,
create_origdatablock_instance,
create_scicat_dataset_instance,
@@ -22,7 +31,7 @@
from scicat_logging import build_logger
from scicat_metadata import collect_schemas, select_applicable_schema
from scicat_path_helpers import compose_ingestor_directory
from system_helpers import handle_exceptions
from system_helpers import exit, handle_exceptions


def build_offline_config() -> OfflineIngestorConfig:
@@ -41,6 +50,67 @@ def build_offline_config() -> OfflineIngestorConfig:
return build_dataclass(OfflineIngestorConfig, merged_configuration)


def _check_if_dataset_exists_by_pid(
local_dataset: ScicatDataset,
ingest_config: IngestionOptions,
scicat_config: SciCatOptions,
logger: logging.Logger,
) -> bool:
"""
Check if a dataset with the same pid exists already in SciCat.
"""
if ingest_config.check_if_dataset_exists_by_pid and (local_dataset.pid is not None):
logger.info(
"Checking if dataset with pid %s already exists.", local_dataset.pid
)
return check_dataset_by_pid(
pid=local_dataset.pid, config=scicat_config, logger=logger
)

    # Otherwise, assume the dataset does not exist.
return False


def _check_if_dataset_exists_by_metadata(
local_dataset: ScicatDataset,
ingest_config: IngestionOptions,
scicat_config: SciCatOptions,
logger: logging.Logger,
) -> bool:
"""
Check if a dataset already exists in SciCat where
the metadata key specified has the same value as the dataset that we want to create
"""
if ingest_config.check_if_dataset_exists_by_metadata:
metadata_key = ingest_config.check_if_dataset_exists_by_metadata_key
target_metadata: dict = local_dataset.scientificMetadata.get(metadata_key, {})
metadata_value = target_metadata.get("value")

if metadata_value is not None:
logger.info(
"Checking if dataset with scientific metadata key %s "
"set to value %s already exists.",
metadata_key,
metadata_value,
)
return check_dataset_by_metadata(
metadata_key=metadata_key,
metadata_value=metadata_value,
config=scicat_config,
logger=logger,
)
else:
logger.info(
"No value found for metadata key %s specified for checking dataset.",
metadata_key,
)
else:
logger.info("No metadata key specified for checking dataset existence.")

    # Otherwise, assume the dataset does not exist.
return False


def main() -> None:
"""Main entry point of the app."""
config = build_offline_config()
Expand Down Expand Up @@ -84,21 +154,29 @@ def main() -> None:

# Prepare scicat dataset instance(entry)
logger.info("Preparing scicat dataset instance ...")
local_dataset = scicat_dataset_to_dict(
create_scicat_dataset_instance(
metadata_schema_id=metadata_schema["id"],
metadata_schemas=metadata_schema["schemas"],
variable_map=variable_map,
data_file_list=data_file_list,
config=config.dataset,
logger=logger,
)
local_dataset_instance = create_scicat_dataset_instance(
metadata_schema_id=metadata_schema["id"],
metadata_schemas=metadata_schema["schemas"],
variable_map=variable_map,
data_file_list=data_file_list,
config=config.dataset,
logger=logger,
)
# Check if dataset already exists in SciCat
if _check_if_dataset_exists_by_pid(
local_dataset_instance, config.ingestion, config.scicat, logger
) or _check_if_dataset_exists_by_metadata(
local_dataset_instance, config.ingestion, config.scicat, logger
):
        logger.warning(
            "Dataset with pid %s already present in SciCat. Skipping ingestion.",
            local_dataset_instance.pid,
        )
exit(logger, unexpected=False)

# If dataset does not exist, continue with the creation of the dataset
local_dataset = scicat_dataset_to_dict(local_dataset_instance)
logger.debug("Scicat dataset: %s", local_dataset)
# Create dataset in scicat
scicat_dataset = create_scicat_dataset(
dataset=local_dataset, config=config.scicat, logger=logger
)

# Prepare origdatablock
logger.info("Preparing scicat origdatablock instance ...")
@@ -110,17 +188,35 @@
)
)
logger.debug("Scicat origdatablock: %s", local_origdatablock)
# create origdatablock in scicat
scicat_origdatablock = create_scicat_origdatablock(
origdatablock=local_origdatablock, config=config.scicat, logger=logger
)

# check one more time if we successfully created the entries in scicat
if not ((len(scicat_dataset) > 0) and (len(scicat_origdatablock) > 0)):
logger.error(
"Failed to create dataset or origdatablock in scicat.\n"
"SciCat dataset: %s\nSciCat origdatablock: %s",
scicat_dataset,
scicat_origdatablock,
# Create dataset in scicat
if config.ingestion.dry_run:
logger.info(
"Dry run mode. Skipping Scicat API calls for creating dataset ..."
)
exit(logger, unexpected=False)
else:
scicat_dataset = create_scicat_dataset(
dataset=local_dataset, config=config.scicat, logger=logger
)

# create origdatablock in scicat
scicat_origdatablock = create_scicat_origdatablock(
origdatablock=local_origdatablock, config=config.scicat, logger=logger
)

# check one more time if we successfully created the entries in scicat
if not ((len(scicat_dataset) > 0) and (len(scicat_origdatablock) > 0)):
logger.error(
"Failed to create dataset or origdatablock in scicat.\n"
"SciCat dataset: %s\nSciCat origdatablock: %s",
scicat_dataset,
scicat_origdatablock,
)
raise RuntimeError("Failed to create dataset or origdatablock.")

        # Report the final status and exit accordingly
exit(
logger,
unexpected=not (bool(scicat_dataset) and bool(scicat_origdatablock)),
)
raise RuntimeError("Failed to create dataset or origdatablock.")
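Taken together, the new main() flow is: build the dataset instance, skip early if it already exists (by pid or by the configured metadata key), honor dry_run before any write, and only then create the dataset and origdatablock. A toy sketch of that decision order, with stand-ins for the real SciCat calls:

import logging

logger = logging.getLogger("offline_ingestor_sketch")

def exists_by_pid(dataset: dict) -> bool:
    return False  # stand-in for check_dataset_by_pid

def exists_by_metadata(dataset: dict) -> bool:
    return False  # stand-in for check_dataset_by_metadata

def ingest(dataset: dict, dry_run: bool = False) -> None:
    if exists_by_pid(dataset) or exists_by_metadata(dataset):
        logger.warning("Dataset already in SciCat; skipping ingestion")
        return  # the real code exits gracefully with unexpected=False
    if dry_run:
        logger.info("Dry run; no SciCat API calls made")
        return
    scicat_dataset = {"pid": dataset["pid"]}    # stand-in for create_scicat_dataset
    scicat_origdatablock = {"_id": "block-1"}   # stand-in for create_scicat_origdatablock
    if not (scicat_dataset and scicat_origdatablock):
        raise RuntimeError("Failed to create dataset or origdatablock.")

ingest({"pid": "20.500.12269/demo-001"}, dry_run=True)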
