From f14db560e21abab7a55d71876e3c8c7ed6af3748 Mon Sep 17 00:00:00 2001 From: Markus Binsteiner Date: Thu, 7 Mar 2024 16:11:02 +0100 Subject: [PATCH] feat: add zenodo download module --- pyproject.toml | 4 +- .../onboarding/modules/__init__.py | 29 +++- .../onboarding/modules/download/__init__.py | 13 +- .../onboarding/modules/download/github.py | 13 +- .../onboarding/modules/download/zenodo.py | 164 ++++++++++++++++++ 5 files changed, 210 insertions(+), 13 deletions(-) create mode 100644 src/kiara_plugin/onboarding/modules/download/zenodo.py diff --git a/pyproject.toml b/pyproject.toml index f01b221..44f71bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -244,6 +244,8 @@ plugins = [ # mypy per-module options: [[tool.mypy.overrides]] module = [ - "pyzenodo3.*" + "patoolib.*", + "pyzenodo3.*", + "ruamel.*", ] ignore_missing_imports = true diff --git a/src/kiara_plugin/onboarding/modules/__init__.py b/src/kiara_plugin/onboarding/modules/__init__.py index 7de192e..fadce80 100644 --- a/src/kiara_plugin/onboarding/modules/__init__.py +++ b/src/kiara_plugin/onboarding/modules/__init__.py @@ -232,7 +232,7 @@ def create_outputs_schema( def process(self, inputs: ValueMap, outputs: ValueMap): - from kiara.models.filesystem import FolderImportConfig + from kiara.models.filesystem import FolderImportConfig, KiaraFileBundle bundle_name = self.get_config_value("result_bundle_name") if bundle_name is None: @@ -280,20 +280,36 @@ def process(self, inputs: ValueMap, outputs: ValueMap): if attach_metadata_to_files is None: attach_metadata_to_files = inputs.get_value_data("attach_metadata_to_files") - archive = self.retrieve_archive(inputs=inputs) - result = self.extract_archive( - archive_file=archive, + archive = self.retrieve_archive( + inputs=inputs, bundle_name=bundle_name, attach_metadata_to_bundle=attach_metadata_to_bundle, attach_metadata_to_files=attach_metadata_to_files, import_config=import_config, ) + if isinstance(archive, KiaraFileBundle): + result = archive + else: 
+ result = self.extract_archive( + archive_file=archive, + bundle_name=bundle_name, + attach_metadata_to_bundle=attach_metadata_to_bundle, + attach_metadata_to_files=attach_metadata_to_files, + import_config=import_config, + ) outputs.set_value("file_bundle", result) @abc.abstractmethod - def retrieve_archive(self, inputs: ValueMap) -> "KiaraFile": - pass + def retrieve_archive( + self, + inputs: ValueMap, + bundle_name: Union[str, None], + attach_metadata_to_bundle: bool, + attach_metadata_to_files: bool, + import_config: "FolderImportConfig", + ) -> Union["KiaraFile", "KiaraFileBundle"]: + """Retrieve an archive file, or the actual result file bundle.""" def extract_archive( self, @@ -303,6 +319,7 @@ def extract_archive( attach_metadata_to_files: bool, import_config: "FolderImportConfig", ) -> "KiaraFileBundle": + """Extract the archive file that was returned in 'retrieve_archive'.""" from kiara.models.filesystem import KiaraFileBundle diff --git a/src/kiara_plugin/onboarding/modules/download/__init__.py b/src/kiara_plugin/onboarding/modules/download/__init__.py index 55005a6..56e811c 100644 --- a/src/kiara_plugin/onboarding/modules/download/__init__.py +++ b/src/kiara_plugin/onboarding/modules/download/__init__.py @@ -8,7 +8,7 @@ from kiara_plugin.onboarding.modules import OnboardFileBundleModule, OnboardFileModule if TYPE_CHECKING: - from kiara.models.filesystem import KiaraFile + from kiara.models.filesystem import FolderImportConfig, KiaraFile, KiaraFileBundle class DownloadFileModule(OnboardFileModule): @@ -65,7 +65,14 @@ def create_onboard_inputs_schema(self) -> Dict[str, Any]: return result - def retrieve_archive(self, inputs: ValueMap) -> "KiaraFile": + def retrieve_archive( + self, + inputs: ValueMap, + bundle_name: Union[str, None], + attach_metadata_to_bundle: bool, + attach_metadata_to_files: bool, + import_config: "FolderImportConfig", + ) -> Union["KiaraFile", "KiaraFileBundle"]: from urllib.parse import urlparse @@ -87,7 +94,7 @@ def 
retrieve_archive(self, inputs: ValueMap) -> "KiaraFile": kiara_file: KiaraFile - kiara_file = download_file( + kiara_file = download_file( # type: ignore url, target=tmp_file.name, attach_metadata=True, return_md5_hash=False ) diff --git a/src/kiara_plugin/onboarding/modules/download/github.py b/src/kiara_plugin/onboarding/modules/download/github.py index 198b50c..8ca466e 100644 --- a/src/kiara_plugin/onboarding/modules/download/github.py +++ b/src/kiara_plugin/onboarding/modules/download/github.py @@ -5,7 +5,7 @@ from kiara_plugin.onboarding.modules import OnboardFileBundleModule, OnboardFileModule if TYPE_CHECKING: - from kiara.models.filesystem import KiaraFile + from kiara.models.filesystem import FolderImportConfig, KiaraFile, KiaraFileBundle class DownloadGithubFileModule(OnboardFileModule): @@ -49,7 +49,7 @@ def retrieve_file( return result_file -class DownloadFileBundleModule(OnboardFileBundleModule): +class DownloadGithbFileBundleModule(OnboardFileBundleModule): """Download a file bundle from a remote github repository. If 'sub_path' is not specified, the whole repo will be used. 
@@ -70,7 +70,14 @@ def create_onboard_inputs_schema(self) -> Dict[str, Any]:
         }
         return result
 
-    def retrieve_archive(self, inputs: ValueMap) -> "KiaraFile":
+    def retrieve_archive(
+        self,
+        inputs: ValueMap,
+        bundle_name: Union[str, None],
+        attach_metadata_to_bundle: bool,
+        attach_metadata_to_files: bool,
+        import_config: "FolderImportConfig",
+    ) -> Union["KiaraFile", "KiaraFileBundle"]:
 
         from kiara_plugin.onboarding.utils.download import download_file
 
diff --git a/src/kiara_plugin/onboarding/modules/download/zenodo.py b/src/kiara_plugin/onboarding/modules/download/zenodo.py
new file mode 100644
index 0000000..59c60c1
--- /dev/null
+++ b/src/kiara_plugin/onboarding/modules/download/zenodo.py
@@ -0,0 +1,164 @@
+# -*- coding: utf-8 -*-
+from typing import TYPE_CHECKING, Any, Dict, Union
+
+from kiara.exceptions import KiaraException
+from kiara.models.values.value import ValueMap
+from kiara_plugin.onboarding.modules import OnboardFileBundleModule, OnboardFileModule
+
+if TYPE_CHECKING:
+    from kiara.models.filesystem import FolderImportConfig, KiaraFile, KiaraFileBundle
+
+
+class DownloadZenodoFileModule(OnboardFileModule):
+    """Download a single file from a Zenodo record."""
+
+    _module_type_name = "download.file.from.zenodo"
+
+    def create_onboard_inputs_schema(self) -> Dict[str, Any]:
+        """Declare the module inputs: the record 'doi' and the file 'path' in the record."""
+        result: Dict[str, Dict[str, Any]] = {
+            "doi": {"type": "string", "doc": "The DOI."},
+            "path": {
+                "type": "string",
+                "doc": "The path to the file/file name within the dataset.",
+                "optional": True,
+            },
+        }
+        return result
+
+    def retrieve_file(
+        self, inputs: ValueMap, file_name: Union[str, None], attach_metadata: bool
+    ) -> Any:
+        """Download the file stored under 'path' in the Zenodo record identified by 'doi'."""
+        import pyzenodo3
+
+        from kiara_plugin.onboarding.utils.download import download_file
+
+        doi = inputs.get_value_data("doi")
+        file_path = inputs.get_value_data("path")
+        # a value without '/zenodo.' is treated as a bare record id and expanded to a full DOI
+        if "/zenodo." not in doi:
+            doi = f"10.5281/zenodo.{doi}"
+
+        zen = pyzenodo3.Zenodo()
+        record = zen.find_record_by_doi(doi)
+        # look for the record file whose key equals the requested path
+        match = None
+        for _available_file in record.data["files"]:
+            if file_path == _available_file["key"]:
+                match = _available_file
+                break
+        # nothing matched: fail with the list of files the record does contain
+        if not match:
+            msg = "Available files:\n"
+            for key in record.data["files"]:
+                msg += f" - {key['key']}\n"
+            raise KiaraException(
+                msg=f"Can't find file '{file_path}' in Zenodo record. {msg}"
+            )
+
+        url = match["links"]["self"]
+        checksum = match["checksum"][4:]  # drop the 4-char algorithm prefix (presumably "md5:")
+        # NOTE(review): this overwrites the 'file_name' argument -- confirm that is intended
+        file_name = file_path.split("/")[-1]
+
+        result_file: KiaraFile
+        result_file, result_checksum = download_file(  # type: ignore
+            url=url,
+            file_name=file_name,
+            attach_metadata=attach_metadata,
+            return_md5_hash=True,
+        )
+        # verify the download against the checksum published in the record
+        if checksum != result_checksum:
+            raise KiaraException(
+                msg=f"Can't download file '{file_name}' from zenodo, invalid checksum: {checksum} != {result_checksum}"
+            )
+
+        if attach_metadata:
+            result_file.metadata["zenodo_record_data"] = record.data
+
+        return result_file
+
+
+class DownloadZenodoFileBundleModule(OnboardFileBundleModule):
+    """Download all files of a Zenodo record as a file bundle.
+
+    The record is identified by the 'doi' input; every file is checksum-verified.
+
+    """
+
+    _module_type_name = "download.file_bundle.from.zenodo"
+
+    def create_onboard_inputs_schema(self) -> Dict[str, Any]:
+        result: Dict[str, Dict[str, Any]] = {
+            "doi": {"type": "string", "doc": "The DOI."},
+        }
+        return result
+
+    def retrieve_archive(
+        self,
+        inputs: ValueMap,
+        bundle_name: Union[str, None],
+        attach_metadata_to_bundle: bool,
+        attach_metadata_to_files: bool,
+        import_config: "FolderImportConfig",
+    ) -> Union["KiaraFile", "KiaraFileBundle"]:
+        """Download every file of the record into a tmp dir and import that dir as a bundle."""
+        import pyzenodo3
+
+        from kiara.models.filesystem import KiaraFile, KiaraFileBundle
+        from kiara_plugin.onboarding.utils.download import download_file
+
+        doi = inputs.get_value_data("doi")
+        # a value without '/zenodo.' is treated as a bare record id and expanded to a full DOI
+        if "/zenodo." not in doi:
+            doi = f"10.5281/zenodo.{doi}"
+
+        zen = pyzenodo3.Zenodo()
+        record = zen.find_record_by_doi(doi)
+
+        base_path = KiaraFileBundle.create_tmp_dir()
+
+        for _available_file in record.data["files"]:
+            match = _available_file
+
+            url = match["links"]["self"]
+            checksum = match["checksum"][4:]  # drop the 4-char algorithm prefix (presumably "md5:")
+
+            file_path = _available_file["key"]
+            full_path = base_path / file_path
+
+            file_name = file_path.split("/")[-1]
+
+            # TODO: filter here already, so we don't need to download files we don't want
+
+            result_file: KiaraFile
+            result_file, result_checksum = download_file(  # type: ignore
+                url=url,
+                target=full_path.as_posix(),
+                file_name=file_name,
+                attach_metadata=True,
+                return_md5_hash=True,
+            )
+            # verify each download against the checksum published in the record
+            if checksum != result_checksum:
+                raise KiaraException(
+                    msg=f"Can't download file '{file_name}' from zenodo, invalid checksum: {checksum} != {result_checksum}"
+                )
+        # no explicit bundle name given: fall back to the (expanded) DOI
+        if not bundle_name:
+            bundle_name = doi
+        result = KiaraFileBundle.import_folder(
+            source=base_path.as_posix(),
+            bundle_name=bundle_name,
+            import_config=import_config,
+        )
+        if attach_metadata_to_bundle:
+            result.metadata["zenodo_record_data"] = record.data
+
+        if attach_metadata_to_files:
+            for file in result.included_files.values():
+                file.metadata["zenodo_record_data"] = record.data
+
+        return result