Merge pull request #75 from SciCatProject/check-dataset-by-job-id-2
Check dataset by job id and metadata
YooSunYoung authored Oct 28, 2024
2 parents b941d83 + cce2dae commit abf6876
Showing 4 changed files with 207 additions and 37 deletions.
4 changes: 3 additions & 1 deletion resources/config.sample.json
@@ -2,7 +2,6 @@
"config_file": "",
"id": "",
"dataset": {
"check_by_job_id": true,
"allow_dataset_pid": true,
"generate_dataset_pid": false,
"dataset_pid_prefix": "20.500.12269",
@@ -17,6 +16,9 @@
"dry_run": false,
"offline_ingestor_executable": "background_ingestor",
"schemas_directory": "schemas",
"check_if_dataset_exists_by_pid": true,
"check_if_dataset_exists_by_metadata": true,
"check_if_dataset_exists_by_metadata_key": "job_id",
"file_handling": {
"compute_file_stats": true,
"compute_file_hash": true,
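The three new check_if_dataset_exists_by_* keys replace the old dataset.check_by_job_id flag: existence can now be checked by PID, by a metadata key, or by both. A minimal sketch of reading them, assuming the keys sit under an "ingestion" section of the config (as the IngestionOptions dataclass below suggests) and using a hypothetical file path:

import json

# Hypothetical path; point this at the deployed ingestor configuration.
with open("resources/config.sample.json") as f:
    ingestion_cfg = json.load(f)["ingestion"]

check_by_pid = ingestion_cfg["check_if_dataset_exists_by_pid"]            # True
check_by_metadata = ingestion_cfg["check_if_dataset_exists_by_metadata"]  # True
metadata_key = ingestion_cfg["check_if_dataset_exists_by_metadata_key"]   # "job_id"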
79 changes: 78 additions & 1 deletion src/scicat_communication.py
@@ -1,7 +1,9 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)
import json
import logging
from urllib.parse import urljoin
from typing import Any
from urllib.parse import quote, urljoin

import requests
from scicat_configuration import SciCatOptions
@@ -24,6 +26,12 @@ class ScicatDatasetAPIError(Exception):
pass


def _get_from_scicat(
    *, url: str, headers: dict, timeout: int, stream: bool, verify: bool
) -> requests.Response:
    # Forward stream and verify so the SciCatOptions settings take effect.
    return requests.get(
        url, headers=headers, timeout=timeout, stream=stream, verify=verify
    )


def _post_to_scicat(*, url: str, posting_obj: dict, headers: dict, timeout: int):
return requests.request(
method="POST",
@@ -97,3 +105,72 @@ def create_scicat_origdatablock(
result['_id'],
)
return result


def check_dataset_by_pid(
pid: str, config: SciCatOptions, logger: logging.Logger
) -> bool:
response = _get_from_scicat(
url=urljoin(config.host, f"datasets/{quote(pid)}"),
headers=config.headers,
timeout=config.timeout,
stream=config.stream,
verify=config.verify,
)
dataset_exists: bool
if not response.ok:
logger.error(
"Failed to check dataset existence by pid with status code: %s. "
"Error message from scicat backend: \n%s\n"
"Assuming the dataset does not exist.",
response.status_code,
response.reason,
)
dataset_exists = False
elif response.json():
logger.info("Dataset with pid %s exists.", pid)
dataset_exists = True
else:
logger.info("Dataset with pid %s does not exist.", pid)
dataset_exists = False

return dataset_exists


def check_dataset_by_metadata(
metadata_key: str,
metadata_value: Any,
config: SciCatOptions,
logger: logging.Logger,
) -> bool:
metadata_dict = {f"scientificMetadata.{metadata_key}.value": metadata_value}
filter_string = '?filter={"where":' + json.dumps(metadata_dict) + "}"
url = urljoin(config.host, "datasets") + filter_string
logger.info("Checking if dataset exists by metadata with url: %s", url)
response = _get_from_scicat(
url=url,
headers=config.headers,
timeout=config.timeout,
stream=config.stream,
verify=config.verify,
)
dataset_exists: bool
if not response.ok:
logger.error(
"Failed to check dataset existence by metadata key %s with status code: %s "
"Error message from scicat backend: \n%s\n"
"Assuming the dataset does not exist.",
metadata_key,
response.status_code,
response.reason,
)
dataset_exists = False
elif response.json():
logger.info("Retrieved %s dataset(s) from SciCat", len(response.json()))
logger.info("Dataset with metadata %s exists.", metadata_dict)
dataset_exists = True
else:
logger.info("Dataset with metadata %s does not exist.", metadata_dict)
dataset_exists = False

return dataset_exists
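For reference, the filter that check_dataset_by_metadata sends is a plain SciCat "where" clause on a scientific-metadata field. A small sketch of the URL it builds, with a hypothetical host and job id:

import json
from urllib.parse import urljoin

host = "http://localhost/api/v3/"  # hypothetical; in practice this comes from SciCatOptions.host
metadata_dict = {"scientificMetadata.job_id.value": "job-2024-001"}  # hypothetical job id
filter_string = '?filter={"where":' + json.dumps(metadata_dict) + "}"
url = urljoin(host, "datasets") + filter_string
# url == 'http://localhost/api/v3/datasets?filter={"where":{"scientificMetadata.job_id.value": "job-2024-001"}}'

Note that the filter value is not URL-encoded here; quote() is only applied to the pid path segment in check_dataset_by_pid.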
11 changes: 3 additions & 8 deletions src/scicat_configuration.py
@@ -210,6 +210,9 @@ class IngestionOptions:
dry_run: bool = False
offline_ingestor_executable: str = "background_ingestor"
schemas_directory: str = "schemas"
check_if_dataset_exists_by_pid: bool = True
check_if_dataset_exists_by_metadata: bool = True
check_if_dataset_exists_by_metadata_key: str = "job_id"
file_handling: FileHandlingOptions = field(default_factory=FileHandlingOptions)


@@ -219,7 +222,6 @@ def default_access_groups() -> list[str]:

@dataclass(kw_only=True)
class DatasetOptions:
check_by_job_id: bool = True
allow_dataset_pid: bool = True
generate_dataset_pid: bool = False
dataset_pid_prefix: str = "20.500.12269"
@@ -238,13 +240,6 @@ class SciCatOptions:
stream: bool = True
verify: bool = False

@classmethod
def from_configurations(cls, config: dict) -> "SciCatOptions":
"""Create SciCatOptions from a dictionary."""
options = cls(**config)
options.headers = {"Authorization": f"Bearer {options.token}"}
return options


@dataclass(kw_only=True)
class OnlineIngestorConfig:
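The new fields follow the module's keyword-only dataclass pattern. A self-contained sketch of the equivalent shape (a stand-in for illustration, not the real class, which carries further fields):

from dataclasses import dataclass

@dataclass(kw_only=True)
class IngestionOptionsSketch:
    # Mirrors the three fields added to IngestionOptions above.
    check_if_dataset_exists_by_pid: bool = True
    check_if_dataset_exists_by_metadata: bool = True
    check_if_dataset_exists_by_metadata_key: str = "job_id"

# E.g. a facility that keys dataset identity on a different metadata field:
opts = IngestionOptionsSketch(check_if_dataset_exists_by_metadata_key="run_id")
print(opts.check_if_dataset_exists_by_metadata_key)  # run_id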
150 changes: 123 additions & 27 deletions src/scicat_offline_ingestor.py
@@ -1,17 +1,26 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2024 ScicatProject contributors (https://github.com/ScicatProject)

import logging
from pathlib import Path

import h5py
from scicat_communication import create_scicat_dataset, create_scicat_origdatablock
from scicat_communication import (
check_dataset_by_metadata,
check_dataset_by_pid,
create_scicat_dataset,
create_scicat_origdatablock,
)
from scicat_configuration import (
IngestionOptions,
OfflineIngestorConfig,
SciCatOptions,
build_arg_parser,
build_dataclass,
merge_config_and_input_args,
)
from scicat_dataset import (
ScicatDataset,
create_data_file_list,
create_origdatablock_instance,
create_scicat_dataset_instance,
@@ -22,7 +31,7 @@
from scicat_logging import build_logger
from scicat_metadata import collect_schemas, select_applicable_schema
from scicat_path_helpers import compose_ingestor_directory
from system_helpers import handle_exceptions
from system_helpers import exit, handle_exceptions


def build_offline_config() -> OfflineIngestorConfig:
@@ -41,6 +50,67 @@ def build_offline_config() -> OfflineIngestorConfig:
return build_dataclass(OfflineIngestorConfig, merged_configuration)


def _check_if_dataset_exists_by_pid(
local_dataset: ScicatDataset,
ingest_config: IngestionOptions,
scicat_config: SciCatOptions,
logger: logging.Logger,
) -> bool:
"""
Check if a dataset with the same pid exists already in SciCat.
"""
if ingest_config.check_if_dataset_exists_by_pid and (local_dataset.pid is not None):
logger.info(
"Checking if dataset with pid %s already exists.", local_dataset.pid
)
return check_dataset_by_pid(
pid=local_dataset.pid, config=scicat_config, logger=logger
)

    # Otherwise, assume the dataset does not exist.
return False


def _check_if_dataset_exists_by_metadata(
local_dataset: ScicatDataset,
ingest_config: IngestionOptions,
scicat_config: SciCatOptions,
logger: logging.Logger,
) -> bool:
"""
Check if a dataset already exists in SciCat where
the metadata key specified has the same value as the dataset that we want to create
"""
if ingest_config.check_if_dataset_exists_by_metadata:
metadata_key = ingest_config.check_if_dataset_exists_by_metadata_key
target_metadata: dict = local_dataset.scientificMetadata.get(metadata_key, {})
metadata_value = target_metadata.get("value")

if metadata_value is not None:
logger.info(
"Checking if dataset with scientific metadata key %s "
"set to value %s already exists.",
metadata_key,
metadata_value,
)
return check_dataset_by_metadata(
metadata_key=metadata_key,
metadata_value=metadata_value,
config=scicat_config,
logger=logger,
)
else:
logger.info(
"No value found for metadata key %s specified for checking dataset.",
metadata_key,
)
else:
logger.info("No metadata key specified for checking dataset existence.")

    # Otherwise, assume the dataset does not exist.
return False


def main() -> None:
"""Main entry point of the app."""
config = build_offline_config()
Expand Down Expand Up @@ -84,21 +154,29 @@ def main() -> None:

# Prepare scicat dataset instance(entry)
logger.info("Preparing scicat dataset instance ...")
local_dataset = scicat_dataset_to_dict(
create_scicat_dataset_instance(
metadata_schema_id=metadata_schema["id"],
metadata_schemas=metadata_schema["schemas"],
variable_map=variable_map,
data_file_list=data_file_list,
config=config.dataset,
logger=logger,
)
local_dataset_instance = create_scicat_dataset_instance(
metadata_schema_id=metadata_schema["id"],
metadata_schemas=metadata_schema["schemas"],
variable_map=variable_map,
data_file_list=data_file_list,
config=config.dataset,
logger=logger,
)
# Check if dataset already exists in SciCat
if _check_if_dataset_exists_by_pid(
local_dataset_instance, config.ingestion, config.scicat, logger
) or _check_if_dataset_exists_by_metadata(
local_dataset_instance, config.ingestion, config.scicat, logger
):
        logger.warning(
            "Dataset with pid %s already present in SciCat. Skipping ingestion.",
            local_dataset_instance.pid,
        )
exit(logger, unexpected=False)

# If dataset does not exist, continue with the creation of the dataset
local_dataset = scicat_dataset_to_dict(local_dataset_instance)
logger.debug("Scicat dataset: %s", local_dataset)
# Create dataset in scicat
scicat_dataset = create_scicat_dataset(
dataset=local_dataset, config=config.scicat, logger=logger
)

# Prepare origdatablock
logger.info("Preparing scicat origdatablock instance ...")
@@ -110,17 +188,35 @@
)
)
logger.debug("Scicat origdatablock: %s", local_origdatablock)
# create origdatablock in scicat
scicat_origdatablock = create_scicat_origdatablock(
origdatablock=local_origdatablock, config=config.scicat, logger=logger
)

# check one more time if we successfully created the entries in scicat
if not ((len(scicat_dataset) > 0) and (len(scicat_origdatablock) > 0)):
logger.error(
"Failed to create dataset or origdatablock in scicat.\n"
"SciCat dataset: %s\nSciCat origdatablock: %s",
scicat_dataset,
scicat_origdatablock,
# Create dataset in scicat
if config.ingestion.dry_run:
logger.info(
"Dry run mode. Skipping Scicat API calls for creating dataset ..."
)
exit(logger, unexpected=False)
else:
scicat_dataset = create_scicat_dataset(
dataset=local_dataset, config=config.scicat, logger=logger
)

# create origdatablock in scicat
scicat_origdatablock = create_scicat_origdatablock(
origdatablock=local_origdatablock, config=config.scicat, logger=logger
)

# check one more time if we successfully created the entries in scicat
if not ((len(scicat_dataset) > 0) and (len(scicat_origdatablock) > 0)):
logger.error(
"Failed to create dataset or origdatablock in scicat.\n"
"SciCat dataset: %s\nSciCat origdatablock: %s",
scicat_dataset,
scicat_origdatablock,
)
raise RuntimeError("Failed to create dataset or origdatablock.")

        # Report the final status and exit accordingly
exit(
logger,
unexpected=not (bool(scicat_dataset) and bool(scicat_origdatablock)),
)
raise RuntimeError("Failed to create dataset or origdatablock.")
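Taken together, the new main() flow is: build the dataset instance, skip early if it already exists (by pid or by the configured metadata key), honor dry_run before any write, and only then create the dataset and origdatablock. A toy sketch of that decision order, with stand-ins for the real SciCat calls:

import logging

logger = logging.getLogger("offline_ingestor_sketch")

def exists_by_pid(dataset: dict) -> bool:
    return False  # stand-in for check_dataset_by_pid

def exists_by_metadata(dataset: dict) -> bool:
    return False  # stand-in for check_dataset_by_metadata

def ingest(dataset: dict, dry_run: bool = False) -> None:
    if exists_by_pid(dataset) or exists_by_metadata(dataset):
        logger.warning("Dataset already in SciCat; skipping ingestion")
        return  # the real code exits gracefully with unexpected=False
    if dry_run:
        logger.info("Dry run; no SciCat API calls made")
        return
    scicat_dataset = {"pid": dataset["pid"]}    # stand-in for create_scicat_dataset
    scicat_origdatablock = {"_id": "block-1"}   # stand-in for create_scicat_origdatablock
    if not (scicat_dataset and scicat_origdatablock):
        raise RuntimeError("Failed to create dataset or origdatablock.")

ingest({"pid": "20.500.12269/demo-001"}, dry_run=True)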
