diff --git a/src/aiida/orm/nodes/data/remote/base.py b/src/aiida/orm/nodes/data/remote/base.py index 5b7fec86c2..97f5f067b8 100644 --- a/src/aiida/orm/nodes/data/remote/base.py +++ b/src/aiida/orm/nodes/data/remote/base.py @@ -285,7 +285,21 @@ def _get_size_on_disk_du(self, full_path: Path, transport: Transport) -> int: """ try: - retval, stdout, stderr = transport.exec_command_wait(f'du -s --bytes {full_path}') + # Initially, we were using the `--bytes` option here. However, this is equivalent to `--apparent-size` and + # `--block-size=1`, with the `--apparent-size` option being rather fragile (e.g., its implementation changed + # between the different versions of `du` of Ubuntu 22.04 and 24.04, causing the tests to fail). Thus, we + # retain only `--block-size=1` to obtain the total size in **bytes** as an integer value. Note that + # `--block-size` here refers to the size of "blocks" for displaying the output of `du`, by default 1024 + # bytes (1 KiB), **not** the "disk block size". + + # On the other hand, for the parametrized tests of this functionality in `test_remote.py`, a **disk block + # size** of 4096 bytes (4 KiB) is assumed, meaning that a file with, e.g., only 1 or 10 bytes of actual + # content (its `apparent-size`) still occupies 4096 bytes (4 KiB) on disk. In principle, the disk block size + # can differ between different disk formattings, OS, etc., however, as 4096 bytes (4 KiB) is the default for + # Linux's ext4, as well as macOS, we assume for now that it is a reasonable assumption and default value. + + # Lastly, the `-s` (`--summarize`) option yields only a single total file size value. + retval, stdout, stderr = transport.exec_command_wait(f'du -s --block-size=1 {full_path}') except NotImplementedError as exc: raise NotImplementedError('`exec_command_wait` not implemented for the current transport plugin.') from exc diff --git a/tests/orm/nodes/data/test_remote.py b/tests/orm/nodes/data/test_remote.py index ad12d97273..52848be019 100644 --- a/tests/orm/nodes/data/test_remote.py +++ b/tests/orm/nodes/data/test_remote.py @@ -55,8 +55,8 @@ def test_clean(remote_data_factory, mode): @pytest.mark.parametrize( 'setup, results', ( - (('du', False), ('4.01 KB', 'du')), - (('du', True), (4108, 'du')), + (('du', False), ('8.00 KB', 'du')), + (('du', True), (8192, 'du')), (('stat', False), ('12.00 B', 'stat')), (('stat', True), (12, 'stat')), ), @@ -74,13 +74,12 @@ def test_get_size_on_disk_params(remote_data_factory, mode, setup, results): @pytest.mark.parametrize( 'content, sizes', ( - (b'a', {'du': 4097, 'stat': 1, 'human': '4.00 KB'}), - (10 * b'a', {'du': 4106, 'stat': 10, 'human': '4.01 KB'}), - (100 * b'a', {'du': 4196, 'stat': 100, 'human': '4.10 KB'}), - (1000 * b'a', {'du': 5096, 'stat': 1000, 'human': '4.98 KB'}), - (1000000 * b'a', {'du': 1004096, 'stat': int(1e6), 'human': '980.56 KB'}), + (b'a', {'du': 8192, 'stat': 1, 'human': '8.00 KB'}), + (10 * b'a', {'du': 8192, 'stat': 10, 'human': '8.00 KB'}), + (1000 * b'a', {'du': 8192, 'stat': 1000, 'human': '8.00 KB'}), + (1000000 * b'a', {'du': 1007616, 'stat': int(1e6), 'human': '984.00 KB'}), ), - ids=['1-byte', '10-bytes', '100-bytes', '1000-bytes', '1e6-bytes'], + ids=['1-byte', '10-bytes', '1000-bytes', '1e6-bytes'], ) def test_get_size_on_disk_sizes(remote_data_factory, mode, content, sizes): """Test the different implementations to obtain the size of a ``RemoteData`` on disk.""" @@ -103,12 +102,12 @@ def test_get_size_on_disk_sizes(remote_data_factory, mode, content, sizes): @pytest.mark.parametrize( 'num_char, relpath, sizes', ( - (1, '.', {'du': 12291, 'stat': 8195, 'human': '12.00 KB'}), - (100, '.', {'du': 12588, 'stat': 8492, 'human': '12.29 KB'}), - (int(1e6), '.', {'du': 3012288, 'stat': 3008192, 'human': '2.87 MB'}), - (1, 'subdir1', {'du': 8194, 'stat': 4098, 'human': '8.00 KB'}), - (100, 'subdir1', {'du': 8392, 'stat': 4296, 'human': '8.20 KB'}), - (int(1e6), 'subdir1', {'du': 2008192, 'stat': 2004096, 'human': '1.92 MB'}), + (1, '.', {'du': 24576, 'stat': 8195, 'human': '24.00 KB'}), + (100, '.', {'du': 24576, 'stat': 8492, 'human': '24.00 KB'}), + (int(1e6), '.', {'du': 3022848, 'stat': 3008192, 'human': '2.88 MB'}), + (1, 'subdir1', {'du': 16384, 'stat': 4098, 'human': '16.00 KB'}), + (100, 'subdir1', {'du': 16384, 'stat': 4296, 'human': '16.00 KB'}), + (int(1e6), 'subdir1', {'du': 2015232, 'stat': 2004096, 'human': '1.92 MB'}), ), ) def test_get_size_on_disk_nested(aiida_localhost, tmp_path, num_char, relpath, sizes): @@ -173,7 +172,7 @@ def test_get_size_on_disk_du(remote_data_factory, mode, monkeypatch): with authinfo.get_transport() as transport: size_on_disk = remote_data._get_size_on_disk_du(transport=transport, full_path=full_path) - assert size_on_disk == 4108 + assert size_on_disk == 8192 # Monkeypatch transport exec_command_wait command to simulate it not being implemented, e.g., for FirecREST plugin def mock_exec_command_wait(command):