From 49152bd07121d645ee405ba2f480118f4d6d5b5d Mon Sep 17 00:00:00 2001 From: Marcel Levstek <62072754+marcellevstek@users.noreply.github.com> Date: Mon, 30 Sep 2024 12:03:29 +0200 Subject: [PATCH] Add functionality for setting custom names of downloaded files --- docs/CHANGELOG.rst | 9 ++++++++ src/resdk/resolwe.py | 37 +++++++++++++++++++++++++------ src/resdk/resources/collection.py | 2 +- src/resdk/resources/data.py | 22 ++++++++++++++++-- tests/unit/test_collection.py | 8 +++++-- tests/unit/test_data.py | 8 +++++-- tests/unit/test_resolwe.py | 10 +++++---- 7 files changed, 78 insertions(+), 18 deletions(-) diff --git a/docs/CHANGELOG.rst b/docs/CHANGELOG.rst index cf8b2f16..777a4f7d 100644 --- a/docs/CHANGELOG.rst +++ b/docs/CHANGELOG.rst @@ -4,6 +4,15 @@ Change Log All notable changes to this project are documented in this file. +=================== +21.3.0 - 2024-09-30 +=================== + +Added +----- +- Add an argument for custom file naming of downloaded files + and propagate this change in ``Data`` resource + =================== 21.2.0 - 2024-07-10 diff --git a/src/resdk/resolwe.py b/src/resdk/resolwe.py index 63268dc1..b9ad7bdd 100644 --- a/src/resdk/resolwe.py +++ b/src/resdk/resolwe.py @@ -462,7 +462,12 @@ def get_or_run(self, slug=None, input={}): return Data(resolwe=self, **model_data) def _download_files( - self, files: List[Union[str, Path]], download_dir=None, show_progress=True + self, + files: List[Union[str, Path]], + download_dir=None, + show_progress=True, + custom_file_names: Union[List[str], None] = None, + skip_existing=False, ): """Download files. @@ -473,6 +478,8 @@ def _download_files( :type files: list of file URI :param download_dir: download directory :type download_dir: string + :param custom_file_names: list of file names to save the downloaded files as + :type custom_file_names: list of strings or None :rtype: None """ @@ -484,9 +491,16 @@ def _download_files( "Download directory does not exist: {}".format(download_dir) ) + if not custom_file_names: + custom_file_names = len(files) * [None] + else: + if not len(files) == len(custom_file_names): + raise ValueError( + "Number of files and their corresponding custom file names must be equal." + ) + if not files: self.logger.info("No files to download.") - else: self.logger.info("Downloading files to %s:", download_dir) # Store the sizes of files in the given directory. @@ -494,7 +508,7 @@ def _download_files( sizes: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int)) checksums: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int)) - for file_uri in files: + for file_uri, custom_file_name in zip(files, custom_file_names): file_name = os.path.basename(file_uri) file_path = os.path.dirname(file_uri) file_url = urljoin(self.url, "data/{}".format(file_uri)) @@ -518,12 +532,19 @@ def _download_files( file_size = sizes[file_directory][file_name] + if custom_file_name: + desc = f"Downloading file {file_name} as {custom_file_name}" + actual_file_name = custom_file_name + else: + desc = f"Downloading file {file_name}" + actual_file_name = file_name + with tqdm.tqdm( total=file_size, disable=not show_progress, - desc=f"Downloading file {file_name}", + desc=desc, ) as progress_bar, open( - os.path.join(download_dir, file_path, file_name), "wb" + os.path.join(download_dir, file_path, actual_file_name), "wb" ) as file_handle: response = self.session.get(file_url, stream=True, auth=self.auth) @@ -540,10 +561,12 @@ def _download_files( # checksums that are difficult to reproduce here. return expected_md5 = checksums[file_directory][file_name] - computed_md5 = md5(os.path.join(download_dir, file_path, file_name)) + computed_md5 = md5( + os.path.join(download_dir, file_path, actual_file_name) + ) if expected_md5 != computed_md5: raise ValueError( - f"Checksum of downloaded file {file_name} does not match the expected value." + f"Checksum of downloaded file {actual_file_name} does not match the expected value." ) def data_usage(self, **query_params): diff --git a/src/resdk/resources/collection.py b/src/resdk/resources/collection.py index e6fa19f7..a8e19598 100644 --- a/src/resdk/resources/collection.py +++ b/src/resdk/resources/collection.py @@ -140,7 +140,7 @@ def download(self, file_name=None, field_name=None, download_dir=None): data_files = data.files(file_name, field_name) files.extend("{}/{}".format(data.id, file_name) for file_name in data_files) - self.resolwe._download_files(files, download_dir) + self.resolwe._download_files(files=files, download_dir=download_dir) class Collection(CollectionRelationsMixin, BaseCollection): diff --git a/src/resdk/resources/data.py b/src/resdk/resources/data.py index adabde77..bb5e5f00 100644 --- a/src/resdk/resources/data.py +++ b/src/resdk/resources/data.py @@ -324,7 +324,9 @@ def files(self, file_name=None, field_name=None): return file_list - def download(self, file_name=None, field_name=None, download_dir=None): + def download( + self, file_name=None, field_name=None, download_dir=None, custom_file_name=None + ): """Download Data object's files and directories. Download files and directories from the Resolwe server to the @@ -336,6 +338,8 @@ def download(self, file_name=None, field_name=None, download_dir=None): :type field_name: string :param download_dir: download path :type download_dir: string + :param custom_file_name: custom file name + :type custom_file_name: string :rtype: None Data objects can contain multiple files and directories. All are @@ -353,7 +357,21 @@ def download(self, file_name=None, field_name=None, download_dir=None): "{}/{}".format(self.id, fname) for fname in self.files(file_name, field_name) ] - self.resolwe._download_files(files, download_dir) + + # Only applies if downloading a single file + custom_file_names = None + if custom_file_name: + if file_name or field_name: + custom_file_names = [custom_file_name] * len(files) + else: + logging.warning( + "Setting a custom file name is not supported " + "without specifying file name or field name." + ) + + self.resolwe._download_files( + files=files, download_dir=download_dir, custom_file_names=custom_file_names + ) def stdout(self): """Return process standard output (stdout.txt file content). diff --git a/tests/unit/test_collection.py b/tests/unit/test_collection.py index 7017d871..df6afa1d 100644 --- a/tests/unit/test_collection.py +++ b/tests/unit/test_collection.py @@ -45,14 +45,18 @@ def test_field_name(self, collection_mock): collection_mock.configure_mock(data=[DATA0, DATA2], resolwe=MagicMock()) BaseCollection.download(collection_mock, field_name="output.exp") flist = ["2/outfile.exp"] - collection_mock.resolwe._download_files.assert_called_once_with(flist, None) + collection_mock.resolwe._download_files.assert_called_once_with( + files=flist, download_dir=None + ) # Check if ``output_field`` does not start with 'output' collection_mock.reset_mock() collection_mock.configure_mock(data=[DATA1, DATA0], resolwe=MagicMock()) BaseCollection.download(collection_mock, field_name="fastq") flist = ["1/reads.fq", "1/arch.gz"] - collection_mock.resolwe._download_files.assert_called_once_with(flist, None) + collection_mock.resolwe._download_files.assert_called_once_with( + files=flist, download_dir=None + ) def test_bad_field_name(self): collection = Collection(resolwe=MagicMock(), id=1) diff --git a/tests/unit/test_data.py b/tests/unit/test_data.py index 57f75b7d..dd14a38d 100644 --- a/tests/unit/test_data.py +++ b/tests/unit/test_data.py @@ -178,13 +178,17 @@ def test_download_ok(self, data_mock): Data.download(data_mock) data_mock.resolwe._download_files.assert_called_once_with( - ["123/file1.txt", "123/file2.fq.gz"], None + files=["123/file1.txt", "123/file2.fq.gz"], + download_dir=None, + custom_file_names=None, ) data_mock.reset_mock() Data.download(data_mock, download_dir="/some/path/") data_mock.resolwe._download_files.assert_called_once_with( - ["123/file1.txt", "123/file2.fq.gz"], "/some/path/" + files=["123/file1.txt", "123/file2.fq.gz"], + download_dir="/some/path/", + custom_file_names=None, ) @patch("resdk.resolwe.Resolwe") diff --git a/tests/unit/test_resolwe.py b/tests/unit/test_resolwe.py index 45d2cce4..1007aaf1 100644 --- a/tests/unit/test_resolwe.py +++ b/tests/unit/test_resolwe.py @@ -451,13 +451,15 @@ def test_fail_if_bad_dir(self, resolwe_mock): message = "Download directory does not exist: .*" with self.assertRaisesRegex(ValueError, message): - Resolwe._download_files(resolwe_mock, self.file_list, "/does/not/exist/") + Resolwe._download_files( + resolwe_mock, files=self.file_list, download_dir="/does/not/exist/" + ) @patch("resdk.resolwe.Resolwe", spec=True) def test_empty_file_list(self, resolwe_mock): resolwe_mock.configure_mock(**self.config) - Resolwe._download_files(resolwe_mock, [], download_dir=self.tmp_dir) + Resolwe._download_files(resolwe_mock, files=[], download_dir=self.tmp_dir) resolwe_mock.logger.info.assert_called_once_with("No files to download.") @@ -474,7 +476,7 @@ def test_bad_response(self, resolwe_mock): with self.assertRaisesRegex(Exception, "abc"): Resolwe._download_files( resolwe_mock, - self.file_list[:1], + files=self.file_list[:1], download_dir=self.tmp_dir, ) self.assertEqual(resolwe_mock.logger.info.call_count, 2) @@ -508,7 +510,7 @@ def test_good_response(self, resolwe_mock): Resolwe._download_files( resolwe_mock, - self.file_list, + files=self.file_list, download_dir=self.tmp_dir, show_progress=False, )