From ca734f9b4846e8ceec37f55a8b0593546a4d3394 Mon Sep 17 00:00:00 2001 From: Markus Binsteiner Date: Mon, 8 Apr 2024 22:27:38 +0200 Subject: [PATCH] fix: zenodo bundle download --- .../onboarding/modules/download/zenodo.py | 90 +++++++-------- src/kiara_plugin/onboarding/modules/zenodo.py | 106 ------------------ src/kiara_plugin/onboarding/utils/download.py | 71 +++++++++++- 3 files changed, 108 insertions(+), 159 deletions(-) delete mode 100644 src/kiara_plugin/onboarding/modules/zenodo.py diff --git a/src/kiara_plugin/onboarding/modules/download/zenodo.py b/src/kiara_plugin/onboarding/modules/download/zenodo.py index 59c60c1..b5e3723 100644 --- a/src/kiara_plugin/onboarding/modules/download/zenodo.py +++ b/src/kiara_plugin/onboarding/modules/download/zenodo.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- from typing import TYPE_CHECKING, Any, Dict, Union +from kiara.api import ValueMap from kiara.exceptions import KiaraException -from kiara.models.values.value import ValueMap from kiara_plugin.onboarding.modules import OnboardFileBundleModule, OnboardFileModule if TYPE_CHECKING: @@ -10,7 +10,7 @@ class DownloadZenodoFileModule(OnboardFileModule): - """Download a single file from a github repo.""" + """Download a single file from a Zenodo record.""" _module_type_name = "download.file.from.zenodo" @@ -18,6 +18,11 @@ def create_onboard_inputs_schema(self) -> Dict[str, Any]: result: Dict[str, Dict[str, Any]] = { "doi": {"type": "string", "doc": "The DOI."}, + "version": { + "type": "string", + "doc": "The version of the record to download.", + "optional": True, + }, "path": { "type": "string", "doc": "The path to the file/file name within the dataset.", @@ -35,6 +40,13 @@ def retrieve_file( from kiara_plugin.onboarding.utils.download import download_file doi = inputs.get_value_data("doi") + + version = inputs.get_value_data("version") + if version: + raise NotImplementedError( + "Downloading versioned records is not yet supported." + ) + file_path = inputs.get_value_data("path") if "/zenodo." not in doi: @@ -43,6 +55,18 @@ def retrieve_file( zen = pyzenodo3.Zenodo() record = zen.find_record_by_doi(doi) + if not file_path: + if len(record.data["files"]) == 1: + file_path = record.data["files"][0]["key"] + else: + msg = "Available files:\n" + for key in record.data["files"]: + msg += f" - {key['key']}\n" + + raise KiaraException( + msg=f"Multiple files available in Zenodo record, please specify 'path' input.\n\n{msg}" + ) + match = None for _available_file in record.data["files"]: if file_path == _available_file["key"]: @@ -93,6 +117,11 @@ class DownloadZenodoFileBundleModule(OnboardFileBundleModule): def create_onboard_inputs_schema(self) -> Dict[str, Any]: result: Dict[str, Dict[str, Any]] = { "doi": {"type": "string", "doc": "The DOI."}, + "version": { + "type": "string", + "doc": "The version of the record to download. By default, the latest version will be used.", + "optional": True, + }, } return result @@ -105,60 +134,17 @@ def retrieve_archive( import_config: "FolderImportConfig", ) -> Union["KiaraFile", "KiaraFileBundle"]: - import pyzenodo3 - - from kiara.models.filesystem import KiaraFile, KiaraFileBundle - from kiara_plugin.onboarding.utils.download import download_file + from kiara_plugin.onboarding.utils.download import download_zenodo_file_bundle doi = inputs.get_value_data("doi") + version = inputs.get_value_data("version") - if "/zenodo." not in doi: - doi = f"10.5281/zenodo.{doi}" - - zen = pyzenodo3.Zenodo() - record = zen.find_record_by_doi(doi) - - base_path = KiaraFileBundle.create_tmp_dir() - - for _available_file in record.data["files"]: - match = _available_file - - url = match["links"]["self"] - checksum = match["checksum"][4:] - - file_path = _available_file["key"] - full_path = base_path / file_path - - file_name = file_path.split("/")[-1] - - # TODO: filter here already, so we don't need to download files we don't want - - result_file: KiaraFile - result_file, result_checksum = download_file( # type: ignore - url=url, - target=full_path.as_posix(), - file_name=file_name, - attach_metadata=True, - return_md5_hash=True, - ) - - if checksum != result_checksum: - raise KiaraException( - msg=f"Can't download file '{file_name}' from zenodo, invalid checksum: {checksum} != {result_checksum}" - ) - - if not bundle_name: - bundle_name = doi - result = KiaraFileBundle.import_folder( - source=base_path.as_posix(), + result = download_zenodo_file_bundle( + doi=doi, + version=version, + attach_metadata_to_bundle=attach_metadata_to_bundle, + attach_metadata_to_files=attach_metadata_to_files, bundle_name=bundle_name, import_config=import_config, ) - if attach_metadata_to_bundle: - result.metadata["zenodo_record_data"] = record.data - - if attach_metadata_to_files: - for file in result.included_files.values(): - file.metadata["zenodo_record_data"] = record.data - return result diff --git a/src/kiara_plugin/onboarding/modules/zenodo.py b/src/kiara_plugin/onboarding/modules/zenodo.py deleted file mode 100644 index b16fe71..0000000 --- a/src/kiara_plugin/onboarding/modules/zenodo.py +++ /dev/null @@ -1,106 +0,0 @@ -# -*- coding: utf-8 -*- -import hashlib -import shutil -from pathlib import Path -from typing import Any, Mapping - -import orjson -from pydantic import Field - -from kiara.api import KiaraModule, KiaraModuleConfig, ValueMap, ValueMapSchema -from kiara.exceptions import KiaraProcessingException -from kiara.models.filesystem import KiaraFileBundle - - -class ZenodoDownloadConfig(KiaraModuleConfig): - - metadata_filename: str = Field( - description="The filename for the zenodo metadata.", default="metadata.json" - ) - - -class ZenodoDownload(KiaraModule): - """Download a dataset from zenodo.org.""" - - _module_type_name = "onboard.zenodo_record" - _config_cls = ZenodoDownloadConfig - - def create_inputs_schema( - self, - ) -> ValueMapSchema: - - metadata_filename = self.get_config_value("metadata_filename") - return { - "doi": {"type": "string", "doc": "The doi of the record"}, - "include_metadata": { - "type": "boolean", - "doc": f"Whether to write the record metadata to a file '{metadata_filename}' and include it in the resulting file bundle.", - "default": True, - }, - } - - def create_outputs_schema( - self, - ) -> ValueMapSchema: - - return { - "file_bundle": { - "type": "file_bundle", - } - } - - def download_file(self, file_data: Mapping[str, Any], target_path: Path): - - import httpx - - url = file_data["links"]["self"] - file_name = file_data["key"] - checksum = file_data["checksum"][4:] - - target_file = target_path / file_name - - if target_file.exists(): - raise KiaraProcessingException( - f"Can't download file, target path already exists: {target_path.as_posix()}." - ) - - hash_md5 = hashlib.md5() # noqa - - with open(target_file, "ab") as file2: - with httpx.Client() as client: - with client.stream("GET", url) as resp: - for chunk in resp.iter_bytes(): - hash_md5.update(chunk) - file2.write(chunk) - - if checksum != hash_md5.hexdigest(): - raise KiaraProcessingException( - f"Can't downloda file '{file_name}', invalid checksum: {checksum} != {hash_md5.hexdigest()}" - ) - - return target_file - - def process(self, inputs: ValueMap, outputs: ValueMap): - - import pyzenodo3 - - include_metadata = inputs.get_value_data("include_metadata") - - doi = inputs.get_value_data("doi") - zen = pyzenodo3.Zenodo() - - record = zen.find_record_by_doi(doi) - - path = KiaraFileBundle.create_tmp_dir() - shutil.rmtree(path, ignore_errors=True) - path.mkdir() - for file_data in record.data["files"]: - self.download_file(file_data, path) - - if include_metadata: - metadata_filename = self.get_config_value("metadata_filename") - metadata_file = path / metadata_filename - metadata_file.write_bytes(orjson.dumps(record.data)) - - bundle = KiaraFileBundle.import_folder(path.as_posix()) - outputs.set_value("file_bundle", bundle) diff --git a/src/kiara_plugin/onboarding/utils/download.py b/src/kiara_plugin/onboarding/utils/download.py index fbc6070..7f24395 100644 --- a/src/kiara_plugin/onboarding/utils/download.py +++ b/src/kiara_plugin/onboarding/utils/download.py @@ -5,7 +5,7 @@ from datetime import datetime from functools import lru_cache from pathlib import Path -from typing import Dict, List, Mapping, Tuple, Type, Union +from typing import Any, Dict, List, Mapping, Tuple, Type, Union from pydantic import BaseModel, Field @@ -365,3 +365,72 @@ def onboard_file_bundle( imported_bundle = result return imported_bundle + + +def download_zenodo_file_bundle( + doi: str, + version: Union[None, str], + attach_metadata_to_bundle: bool, + attach_metadata_to_files: bool, + bundle_name: Union[str, None] = None, + import_config: Union[None, Mapping[str, Any], FolderImportConfig] = None, +) -> KiaraFileBundle: + + import pyzenodo3 + + from kiara.models.filesystem import KiaraFile, KiaraFileBundle + + if "/zenodo." not in doi: + doi = f"10.5281/zenodo.{doi}" + + zen = pyzenodo3.Zenodo() + + if version: + raise NotImplementedError("Downloading versioned records is not yet supported.") + + record = zen.find_record_by_doi(doi) + + base_path = KiaraFileBundle.create_tmp_dir() + + for _available_file in record.data["files"]: + match = _available_file + + url = match["links"]["self"] + checksum = match["checksum"][4:] + + file_path = _available_file["key"] + full_path = base_path / file_path + + file_name = file_path.split("/")[-1] + + # TODO: filter here already, so we don't need to download files we don't want + + result_file: KiaraFile + result_file, result_checksum = download_file( # type: ignore + url=url, + target=full_path.as_posix(), + file_name=file_name, + attach_metadata=True, + return_md5_hash=True, + ) + + if checksum != result_checksum: + raise KiaraException( + msg=f"Can't download file '{file_name}' from zenodo, invalid checksum: {checksum} != {result_checksum}" + ) + + if not bundle_name: + bundle_name = doi + result = KiaraFileBundle.import_folder( + source=base_path.as_posix(), + bundle_name=bundle_name, + import_config=import_config, + ) + if attach_metadata_to_bundle: + result.metadata["zenodo_record_data"] = record.data + + if attach_metadata_to_files: + for file in result.included_files.values(): + file.metadata["zenodo_record_data"] = record.data + + return result