Merge pull request #57 from SciCatProject/data-preparation

Data preparation functionalities
SciCatProject · Aug 7, 2024 · 4dc97d4 · 4dc97d4
2 parents e8d5605 + 14d4682
commit 4dc97d4
Show file tree

Hide file tree

Showing 26 changed files with 1,152 additions and 1,204 deletions.
diff --git a/MANIFEST.in b/MANIFEST.in
diff --git a/README.md b/README.md
@@ -87,3 +87,49 @@ copier update
 `tox` controls virtual environment and commands for various purposes.
 Developers and CI actions can use the command.
 For example, `tox -e docs` builds documentation under `./html` directory and `tox -e py310` will run unit tests with python version `3.10`.
+
+## ADR
+(Architecture Decision Records)
+
+### ADR-001: Use ``dataclass`` instead of ``jinja`` or ``dict`` to create dataset/data-block instances.
+We need a dict-like template to create dataset/data-block instances via scicat APIs.
+#### Reason for not using ``dict``
+It used to be implemented with ``dict``, but it had no validation layer, so anyone could easily break the instances without noticing or causing errors in the upstream layers.
+#### Reason for not using ``jinja``
+
+A ``jinja`` template can handle somewhat more complicated logic within the template, e.g. a ``for`` loop or an ``if`` statement can be applied to the variables.
+However, the dataset/data-block instances are not complicated enough to need these features of ``jinja``.
+#### Reason for using ``dataclasses.dataclass``
+At first we tried using ``jinja``, but the dataset/data-block instances are simple enough that we replaced the ``jinja`` template with ``dataclass``.
+``dataclass`` can verify the name and type (if we use static checks) of each field.
+It can be easily turned into a nested dictionary using ``dataclasses.asdict`` function.
+
+#### Downside of using ``dataclass`` instead of ``jinja``
+With a ``jinja`` template, certain fields could be skipped based on a variable.
+However, this is not possible with a dataclass, so it needs extra handling after turning the instance into a dictionary.
+For example, each datafile item can have a ``chk`` field, but this field shouldn't exist if the checksum was not derived.
+With a ``jinja`` template we could handle this as shown below:
+```jinja
+{
+    "path": "{{ path }}",
+    "size": {{ size }},
+    "time": "{{ time }}",
+    {% if chk %}"chk": "{{ chk }}"{% endif %}
+}
+```
+However, with dataclass this should be handled like below.
+```python
+from dataclasses import dataclass, asdict
+@dataclass
+class DataFileItem:
+    path: str
+    size: int
+    time: str
+    chk: None | str = None
+
+data_file_item = {
+    k: v
+    for k, v in asdict(DataFileItem('./', 1, '00:00')).items()
+    if (k!='chk' or v is not None)
+}
+```
diff --git a/pyproject.toml b/pyproject.toml
@@ -32,7 +32,6 @@ dependencies = [
   "ess-streaming-data-types",
   "graypy",
   "h5py",
-  "jinja2",
   "kafka-python",
   "requests",
   "rich"
@@ -46,8 +45,8 @@ dynamic = ["version"]
 "Source" = "https://github.com/ScicatProject/scicat-filewriter-ingest"
 
 [project.scripts]
-scicat_ingestor = "scicat_ingestor:main"
-background_ingestor = "background_ingestor:main"
+scicat_ingestor = "scicat_online_ingestor:main"
+background_ingestor = "scicat_offline_ingestor:main"
 
 [project.entry-points."scicat_ingestor.metadata_extractor"]
 max = "numpy:max"

diff --git a/resources/base.imsc.json.example b/resources/base.imsc.json.example
@@ -3,7 +3,7 @@
   "name" : "Generic metadata schema"
   "instrument" : "",
   "selector" : "filename:starts_with:/ess/data",
- "variables" : {
+  "variables" : {
     "pid": {
       "source": "NXS",
       "path": "/entry/entry_identifier_uuid",

diff --git a/resources/config.sample.json b/resources/config.sample.json
@@ -1,81 +1,71 @@
 {
+  "config_file": "config.json",
+  "id": "",
+  "dataset": {
+    "check_by_job_id": true,
+    "allow_dataset_pid": true,
+    "generate_dataset_pid": false,
+    "dataset_pid_prefix": "20.500.12269",
+    "default_instrument_id": "ID_OF_FALLBACK_INSTRUMENT",
+    "default_proposal_id": "DEFAULT_PROPOSAL_ID",
+    "default_owner_group": "DEFAULT_OWNER_GROUP",
+    "default_access_groups": [
+      "ACCESS_GROUP_1"
+    ]
+  },
+  "ingestion": {
+    "dry_run": false,
+    "offline_ingestor_executable" : "./scicat_offline_ingestor.py",
+    "schemas_directory": "schemas",
+    "file_handling": {
+      "compute_file_stats": true,
+      "compute_file_hash": true,
+      "file_hash_algorithm": "blake2b",
+      "save_file_hash": true,
+      "hash_file_extension": "b2b",
+      "ingestor_files_directory": "../ingestor",
+      "message_to_file": true,
+      "message_file_extension": "message.json"
+    }
+  },
   "kafka": {
-    "topics": ["KAFKA_TOPIC_1", "KAFKA_TOPIC_2"],
+    "topics": [
+      "KAFKA_TOPIC_1",
+      "KAFKA_TOPIC_2"
+    ],
     "group_id": "GROUP_ID",
-    "bootstrap_servers": ["localhost:9093"],
+    "bootstrap_servers": [
+      "localhost:9093"
+    ],
     "sasl_mechanism": "SCRAM-SHA-256",
     "sasl_username": "USERNAME",
     "sasl_password": "PASSWORD",
     "ssl_ca_location": "FULL_PATH_TO_CERTIFICATE_FILE",
     "individual_message_commit": true,
     "enable_auto_commit": true,
-    "auto_offset_reset": "earliest",
-    "message_saving_options": {
-      "message_to_file": true,
-      "message_file_extension": "message.json",
-      "message_output": "SOURCE_FOLDER"
-    }
-  },
-  "user_office": {
-    "host": "https://useroffice.host",
-    "username": "USERNAME",
-    "password": "PASSWORD",
-    "token": "JWT_TOKEN"
-  },
-  "scicat": {
-    "host": "https://scicat.host",
-    "username": "USERNAME",
-    "password": "PASSWORD",
-    "token": "JWT_TOKEN"
-  },
-  "graylog": {"host": "", "port": "", "facility": "scicat.ingestor"},
-  "dataset": {
-    "instrument_id": "ID_OF_FALLBACK_INSTRUMENT",
-    "instrument": "FALLBACK_INSTRUMENT_NAME",
-    "default_proposal_id": "DEFAULT_PROPOSAL_ID",
-    "ownable": {
-        "ownerGroup": "DEFAULT_OWNER_GROUP",
-        "accessGroups": ["ACCESS_GROUP_1"]
-    }
+    "auto_offset_reset": "earliest"
   },
-  "options": {
-    "config_file": "config.json",
+  "logging": {
     "verbose": false,
     "file_log": false,
     "file_log_base_name": "scicat_ingestor_log",
     "file_log_timestamp": false,
     "logging_level": "INFO",
+    "log_message_prefix": "SFI",
     "system_log": false,
     "system_log_facility": "mail",
-    "log_message_prefix": "SFI",
-    "check_by_job_id": true,
-    "pyscicat": null,
-    "graylog": false
-    },
-  "ingestion_options": {
-    "dry_run": false,
-    "schemas_directory": "schemas",
-    "retrieve_instrument_from": "default",
-    "instrument_position_in_file_path": 3,
-    "file_handling_options": {
-      "hdf_structure_in_metadata": false,
-      "hdf_structure_to_file": true,
-      "hdf_structure_file_extension": ".hdf_structure.json",
-      "hdf_structure_output": "SOURCE_FOLDER",
-      "local_output_directory": "data",
-      "compute_file_stats": true,
-      "compute_file_hash": true,
-      "file_hash_algorithm": "blake2b",
-      "save_file_hash": true,
-      "hash_file_extension": "b2b",
-      "ingestor_files_directory": "ingestor"
-    },
-    "dataset_options": {
-      "force_dataset_pid": true,
-      "dataset_pid_prefix": "20.500.12269",
-      "use_job_id_as_dataset_id": true,
-      "beautify_metadata_keys": false,
-      "metadata_levels_separator": " "
-    }
+    "graylog": false,
+    "graylog_host": "",
+    "graylog_port": "",
+    "graylog_facility": "scicat.ingestor"
+  },
+  "scicat": {
+    "host": "https://scicat.host",
+    "token": "JWT_TOKEN",
+    "headers": {},
+    "timeout": 0,
+    "stream": true,
+    "verify": false
   }
 }
+