Skip to content

Commit

Permalink
Merge pull request #57 from SciCatProject/data-preparation
Browse files Browse the repository at this point in the history
Data preparation functionalities
  • Loading branch information
YooSunYoung authored Aug 7, 2024
2 parents e8d5605 + 14d4682 commit 4dc97d4
Show file tree
Hide file tree
Showing 26 changed files with 1,152 additions and 1,204 deletions.
2 changes: 0 additions & 2 deletions MANIFEST.in

This file was deleted.

46 changes: 46 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,49 @@ copier update
`tox` manages virtual environments and commands for various purposes.
Both developers and CI actions can use it.
For example, `tox -e docs` builds documentation under `./html` directory and `tox -e py310` will run unit tests with python version `3.10`.

## ADR
(Architecture Decision Records)

### ADR-001: Use ``dataclass`` instead of ``jinja`` or ``dict`` to create dataset/data-block instances.
We need a dict-like template to create dataset/data-block instances via scicat APIs.
#### Reason for not using ``dict``
It used to be implemented with ``dict``, but it had no validation layer, so anyone could easily break the instances without noticing or causing errors in the upstream layers.
#### Reason for not using ``jinja``

A ``jinja`` template can handle somewhat more complicated logic within the template, e.g. a ``for`` loop or an ``if`` statement can be applied to the variables.
However, the dataset/data-block instances are not complicated enough to need these features of ``jinja``.
#### Reason for using ``dataclasses.dataclass``
We first tried ``jinja``, but the dataset/data-block instances are simple enough that we replaced the ``jinja`` template with a ``dataclass``.
``dataclass`` can verify name and type (if we use static checks) of each field.
It can be easily turned into a nested dictionary using ``dataclasses.asdict`` function.

#### Downside of using ``dataclass`` instead of ``jinja``
With a ``jinja`` template, certain fields could be skipped based on a variable.
However, this is not possible with a ``dataclass``, so extra handling is needed after converting it to a dictionary.
For example, each datafile item can have a ``chk`` field, but this field shouldn't exist if the checksum was not derived.
With a ``jinja`` template we could handle this as shown below:
```jinja
{
    "path": "{{ path }}",
    "size": {{ size }},
    "time": "{{ time }}"{% if chk %},
    "chk": "{{ chk }}"{% endif %}
}
```
However, with a ``dataclass`` this has to be handled as shown below:
```python
from dataclasses import dataclass, asdict
@dataclass
class DataFileItem:
path: str
size: int
time: str
chk: None | str = None

data_file_item = {
k: v
for k, v in asdict(DataFileItem('./', 1, '00:00')).items()
if (k!='chk' or v is not None)
}
```
5 changes: 2 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ dependencies = [
"ess-streaming-data-types",
"graypy",
"h5py",
"jinja2",
"kafka-python",
"requests",
"rich"
Expand All @@ -46,8 +45,8 @@ dynamic = ["version"]
"Source" = "https://github.com/ScicatProject/scicat-filewriter-ingest"

[project.scripts]
scicat_ingestor = "scicat_ingestor:main"
background_ingestor = "background_ingestor:main"
scicat_ingestor = "scicat_online_ingestor:main"
background_ingestor = "scicat_offline_ingestor:main"

[project.entry-points."scicat_ingestor.metadata_extractor"]
max = "numpy:max"
Expand Down
2 changes: 1 addition & 1 deletion resources/base.imsc.json.example
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"name" : "Generic metadata schema"
"instrument" : "",
"selector" : "filename:starts_with:/ess/data",
"variables" : {
"variables" : {
"pid": {
"source": "NXS",
"path": "/entry/entry_identifier_uuid",
Expand Down
114 changes: 52 additions & 62 deletions resources/config.sample.json
Original file line number Diff line number Diff line change
@@ -1,81 +1,71 @@
{
"config_file": "config.json",
"id": "",
"dataset": {
"check_by_job_id": true,
"allow_dataset_pid": true,
"generate_dataset_pid": false,
"dataset_pid_prefix": "20.500.12269",
"default_instrument_id": "ID_OF_FALLBACK_INSTRUMENT",
"default_proposal_id": "DEFAULT_PROPOSAL_ID",
"default_owner_group": "DEFAULT_OWNER_GROUP",
"default_access_groups": [
"ACCESS_GROUP_1"
]
},
"ingestion": {
"dry_run": false,
"offline_ingestor_executable" : "./scicat_offline_ingestor.py",
"schemas_directory": "schemas",
"file_handling": {
"compute_file_stats": true,
"compute_file_hash": true,
"file_hash_algorithm": "blake2b",
"save_file_hash": true,
"hash_file_extension": "b2b",
"ingestor_files_directory": "../ingestor",
"message_to_file": true,
"message_file_extension": "message.json"
}
},
"kafka": {
"topics": ["KAFKA_TOPIC_1", "KAFKA_TOPIC_2"],
"topics": [
"KAFKA_TOPIC_1",
"KAFKA_TOPIC_2"
],
"group_id": "GROUP_ID",
"bootstrap_servers": ["localhost:9093"],
"bootstrap_servers": [
"localhost:9093"
],
"sasl_mechanism": "SCRAM-SHA-256",
"sasl_username": "USERNAME",
"sasl_password": "PASSWORD",
"ssl_ca_location": "FULL_PATH_TO_CERTIFICATE_FILE",
"individual_message_commit": true,
"enable_auto_commit": true,
"auto_offset_reset": "earliest",
"message_saving_options": {
"message_to_file": true,
"message_file_extension": "message.json",
"message_output": "SOURCE_FOLDER"
}
},
"user_office": {
"host": "https://useroffice.host",
"username": "USERNAME",
"password": "PASSWORD",
"token": "JWT_TOKEN"
},
"scicat": {
"host": "https://scicat.host",
"username": "USERNAME",
"password": "PASSWORD",
"token": "JWT_TOKEN"
},
"graylog": {"host": "", "port": "", "facility": "scicat.ingestor"},
"dataset": {
"instrument_id": "ID_OF_FALLBACK_INSTRUMENT",
"instrument": "FALLBACK_INSTRUMENT_NAME",
"default_proposal_id": "DEFAULT_PROPOSAL_ID",
"ownable": {
"ownerGroup": "DEFAULT_OWNER_GROUP",
"accessGroups": ["ACCESS_GROUP_1"]
}
"auto_offset_reset": "earliest"
},
"options": {
"config_file": "config.json",
"logging": {
"verbose": false,
"file_log": false,
"file_log_base_name": "scicat_ingestor_log",
"file_log_timestamp": false,
"logging_level": "INFO",
"log_message_prefix": "SFI",
"system_log": false,
"system_log_facility": "mail",
"log_message_prefix": "SFI",
"check_by_job_id": true,
"pyscicat": null,
"graylog": false
},
"ingestion_options": {
"dry_run": false,
"schemas_directory": "schemas",
"retrieve_instrument_from": "default",
"instrument_position_in_file_path": 3,
"file_handling_options": {
"hdf_structure_in_metadata": false,
"hdf_structure_to_file": true,
"hdf_structure_file_extension": ".hdf_structure.json",
"hdf_structure_output": "SOURCE_FOLDER",
"local_output_directory": "data",
"compute_file_stats": true,
"compute_file_hash": true,
"file_hash_algorithm": "blake2b",
"save_file_hash": true,
"hash_file_extension": "b2b",
"ingestor_files_directory": "ingestor"
},
"dataset_options": {
"force_dataset_pid": true,
"dataset_pid_prefix": "20.500.12269",
"use_job_id_as_dataset_id": true,
"beautify_metadata_keys": false,
"metadata_levels_separator": " "
}
"graylog": false,
"graylog_host": "",
"graylog_port": "",
"graylog_facility": "scicat.ingestor"
},
"scicat": {
"host": "https://scicat.host",
"token": "JWT_TOKEN",
"headers": {},
"timeout": 0,
"stream": true,
"verify": false
}
}

Loading

0 comments on commit 4dc97d4

Please sign in to comment.