From 041ccddafbc1c0390813fff94f126558f5f3c574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A5l=20R=C3=B8nning?= Date: Thu, 28 Nov 2024 16:22:55 +0100 Subject: [PATCH] [CDF-23393] HTTP download support (#1241) * HTTP download support * statement moved out from try/except --- .../sourcesystem/cdf_pi/module.toml | 6 +-- .../sourcesystem/cdf_sap_assets/module.toml | 7 +-- .../sourcesystem/cdf_sap_events/module.toml | 6 +-- .../sourcesystem/cdf_sharepoint/module.toml | 12 ++--- cognite_toolkit/_cdf_tk/commands/modules.py | 29 ++++++++---- cognite_toolkit/_cdf_tk/utils/repository.py | 46 ++++++++++++++++++- 6 files changed, 79 insertions(+), 27 deletions(-) diff --git a/cognite_toolkit/_builtin_modules/sourcesystem/cdf_pi/module.toml b/cognite_toolkit/_builtin_modules/sourcesystem/cdf_pi/module.toml index afb12594b..9ef7c8893 100644 --- a/cognite_toolkit/_builtin_modules/sourcesystem/cdf_pi/module.toml +++ b/cognite_toolkit/_builtin_modules/sourcesystem/cdf_pi/module.toml @@ -12,9 +12,9 @@ tags = [ modules = [] [[data]] -repoType = "GitHub" -repo = "cognitedata/toolkit-data" -source = "data/publicdata/pi_timeseries.Table.csv" +repoType = "http" +repo = "https://apps-cdn.cogniteapp.com/toolkit" +source = "publicdata/pi_timeseries.Table.csv" destination = "raw/timeseries.Table.csv" [[extra_resources]] diff --git a/cognite_toolkit/_builtin_modules/sourcesystem/cdf_sap_assets/module.toml b/cognite_toolkit/_builtin_modules/sourcesystem/cdf_sap_assets/module.toml index 22d20d860..609db5112 100644 --- a/cognite_toolkit/_builtin_modules/sourcesystem/cdf_sap_assets/module.toml +++ b/cognite_toolkit/_builtin_modules/sourcesystem/cdf_sap_assets/module.toml @@ -11,11 +11,12 @@ tags = [ modules = [] [[data]] -repoType = "GitHub" -repo = "cognitedata/toolkit-data" -source = "data/publicdata/assets.Table.csv" +repoType = "http" +repo = "https://apps-cdn.cogniteapp.com/toolkit" +source = "publicdata/assets.Table.csv" destination = "raw/dump.Table.csv" + [[extra_resources]] location = "cdf_common/data_sets/demo.DataSet.yaml" diff --git a/cognite_toolkit/_builtin_modules/sourcesystem/cdf_sap_events/module.toml b/cognite_toolkit/_builtin_modules/sourcesystem/cdf_sap_events/module.toml index 7bdad0c82..2f0c55700 100644 --- a/cognite_toolkit/_builtin_modules/sourcesystem/cdf_sap_events/module.toml +++ b/cognite_toolkit/_builtin_modules/sourcesystem/cdf_sap_events/module.toml @@ -8,9 +8,9 @@ tags = [ ] [[data]] -repoType = "GitHub" -repo = "cognitedata/toolkit-data" -source = "data/publicdata/[work]*.csv" +repoType = "http" +repo = "https://apps-cdn.cogniteapp.com/toolkit/publicdata" +source = "workitem.Table.csv;workorder.Table.csv;workpackage.Table.csv;worktask.Table.csv" destination = "raw/" [[extra_resources]] diff --git a/cognite_toolkit/_builtin_modules/sourcesystem/cdf_sharepoint/module.toml b/cognite_toolkit/_builtin_modules/sourcesystem/cdf_sharepoint/module.toml index 3422e5f87..1155b2b94 100644 --- a/cognite_toolkit/_builtin_modules/sourcesystem/cdf_sharepoint/module.toml +++ b/cognite_toolkit/_builtin_modules/sourcesystem/cdf_sharepoint/module.toml @@ -8,15 +8,15 @@ tags = [ ] [[data]] -repoType = "GitHub" -repo = "cognitedata/toolkit-data" -source = "data/publicdata/files/*.pdf" +repoType = "http" +repo = "https://apps-cdn.cogniteapp.com/toolkit/publicdata/files" +source = "PH-25578-P-4110006-001.pdf;PH-25578-P-4110010-001.pdf;PH-25578-P-4110119-001.pdf;PH-ME-P-0003-001.pdf;PH-ME-P-0004-001.pdf;PH-ME-P-0151-001.pdf;PH-ME-P-0152-001.pdf;PH-ME-P-0153-001.pdf;PH-ME-P-0156-001.pdf;PH-ME-P-0156-002.pdf;PH-ME-P-0160-001.pdf;Processed-PH-25578-P-4110006-001.svg;Processed-PH-25578-P-4110010-001.svg;Processed-PH-ME-P-0152-001.svg;Processed-PH-ME-P-0153-001.svg;Processed-PH-ME-P-0156-001.svg;Processed-PH-ME-P-0156-002.svg;Processed-PH-ME-P-0160-001.svg" destination = "files/" [[data]] -repoType = "GitHub" -repo = "cognitedata/toolkit-data" -source = "data/publicdata/valhall_file_metadata.csv" +repoType = "http" +repo = "https://apps-cdn.cogniteapp.com/toolkit" +source = "publicdata/valhall_file_metadata.csv" destination = "raw/files.Table.csv" [[extra_resources]] diff --git a/cognite_toolkit/_cdf_tk/commands/modules.py b/cognite_toolkit/_cdf_tk/commands/modules.py index 929f4f06f..7f0ab1a80 100644 --- a/cognite_toolkit/_cdf_tk/commands/modules.py +++ b/cognite_toolkit/_cdf_tk/commands/modules.py @@ -51,7 +51,7 @@ from cognite_toolkit._cdf_tk.tk_warnings import MediumSeverityWarning from cognite_toolkit._cdf_tk.utils import humanize_collection, read_yaml_file from cognite_toolkit._cdf_tk.utils.modules import module_directory_from_path -from cognite_toolkit._cdf_tk.utils.repository import GitHubFileDownloader +from cognite_toolkit._cdf_tk.utils.repository import FileDownloader from cognite_toolkit._version import __version__ custom_style_fancy = questionary.Style( @@ -73,6 +73,12 @@ POINTER = INDENT + "▶" +_FILE_DOWNLOADERS_BY_TYPE: dict[str, type[FileDownloader]] = { + downloader_cls._type: downloader_cls # type: ignore + for downloader_cls in FileDownloader.__subclasses__() +} + + class ModulesCommand(ToolkitCommand): def __init__(self, print_warning: bool = True, skip_tracking: bool = False, silent: bool = False): super().__init__(print_warning, skip_tracking, silent) @@ -117,7 +123,8 @@ def _create( seen_modules: set[Path] = set() selected_paths: set[Path] = set() - downloader_by_repo: dict[str, GitHubFileDownloader] = {} + downloader_by_repo: dict[str, FileDownloader] = {} + extra_resources: set[Path] = set() for package_name, package in selected_packages.items(): print(f"{INDENT}[{'yellow' if mode == 'clean' else 'green'}]Creating {package_name}[/]") @@ -152,15 +159,17 @@ def _create( if module.definition is not None and download_data: for example_data in module.definition.data: - if example_data.repo_type.casefold() != "github": - self.warn( - MediumSeverityWarning( - f"Unsupported repo type for example data: {example_data.repo_type}" - ) - ) - continue if example_data.repo not in downloader_by_repo: - downloader_by_repo[example_data.repo] = GitHubFileDownloader(example_data.repo) + try: + downloader_cls = _FILE_DOWNLOADERS_BY_TYPE[example_data.repo_type] + except KeyError: + self.warn( + MediumSeverityWarning( + f"Unsupported repo type for example data: {example_data.repo_type}" + ) + ) + continue + downloader_by_repo[example_data.repo] = downloader_cls(example_data.repo) downloader = downloader_by_repo[example_data.repo] downloader.copy(example_data.source, target_dir / example_data.destination) diff --git a/cognite_toolkit/_cdf_tk/utils/repository.py b/cognite_toolkit/_cdf_tk/utils/repository.py index ef14d4507..f6bdd259c 100644 --- a/cognite_toolkit/_cdf_tk/utils/repository.py +++ b/cognite_toolkit/_cdf_tk/utils/repository.py @@ -1,8 +1,10 @@ import fnmatch +from abc import ABC, abstractmethod from collections.abc import Iterable from functools import lru_cache from pathlib import Path -from typing import Literal +from typing import ClassVar, Literal +from urllib.parse import urljoin import requests from requests import Response @@ -13,7 +15,21 @@ from cognite_toolkit._cdf_tk.tk_warnings import HTTPWarning -class GitHubFileDownloader: +class FileDownloader(ABC): + _type: ClassVar[str] = "generic" + + @abstractmethod + def __init__(self, repo: str, errors: Literal["continue", "raise"] = "continue") -> None: + pass + + @abstractmethod + def copy(self, source: str, destination: Path) -> None: + pass + + +class GitHubFileDownloader(FileDownloader): + _type: ClassVar[str] = "github" + api_url = "https://api.github.com" def __init__(self, repo: str, errors: Literal["continue", "raise"] = "continue") -> None: @@ -70,3 +86,29 @@ def _download_file(self, url: str, source: Path, destination: Path) -> None: destination_path = destination / source.name destination_path.parent.mkdir(parents=True, exist_ok=True) destination_path.write_bytes(response.content) + + +class HttpFileDownloader(FileDownloader): + _type: ClassVar[str] = "http" + + def __init__(self, repo: str, errors: Literal["continue", "raise"] = "raise") -> None: + self.repo = repo + self.errors = errors + + def copy(self, source: str, destination: Path) -> None: + if not self.repo.endswith("/"): + self.repo += "/" + + sources = source.split(";") if ";" in source else [source] + for source in sources: + file_url = urljoin(self.repo, source) + + response = requests.get(file_url) + if response.status_code >= 400: + if self.errors == "raise": + response.raise_for_status() + print(HTTPWarning("GET", response.text, response.status_code).get_message()) + continue + + location = destination if not destination.is_dir() else destination / Path(source).name + location.write_bytes(response.content)