Skip to content

Commit

Permalink
Disable apparent-size in du command of get_size_on_disk (#6702)
Browse files Browse the repository at this point in the history
In #6584 the `get_size_on_disk` method was added to `RemoteData`, which calls the `du` utility as the default way to obtain the size of an associated file/directory, using the `-s` (summary) and `--bytes` options. As discovered in #6696, however, the relevant tests started failing when the `ubuntu-latest` environment of GHA got updated to point to Ubuntu 24.04 instead of 22.04. This is because, as it turns out, `--bytes` does not only give the output in bytes, but is actually equivalent to `--apparent-size`, as well as `--block-size=1`, and the behavior of the `--apparent-size` option of `du` has changed between the different Ubuntu versions. Thus, the command is now changed from `du -s --bytes` to `du -s --block-size=1`, as to still obtain the total size in bytes, but make the results more reliable and less fragile by not reporting the apparent size. The size results for the various parametrized test cases have also been updated accordingly.
  • Loading branch information
GeigerJ2 authored Jan 13, 2025
1 parent 80c1741 commit 2da3f96
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 16 deletions.
16 changes: 15 additions & 1 deletion src/aiida/orm/nodes/data/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,21 @@ def _get_size_on_disk_du(self, full_path: Path, transport: Transport) -> int:
"""

try:
retval, stdout, stderr = transport.exec_command_wait(f'du -s --bytes {full_path}')
# Initially, we were using the `--bytes` option here. However, this is equivalent to `--apparent-size` and
# `--block-size=1`, with the `--apparent-size` option being rather fragile (e.g., its implementation changed
# between the different versions of `du` of Ubuntu 22.04 and 24.04, causing the tests to fail). Thus, we
# retain only `--block-size=1` to obtain the total size in **bytes** as an integer value. Note that
# `--block-size` here refers to the size of "blocks" for displaying the output of `du`, by default 1024
# bytes (1 KiB), **not** the "disk block size".

# On the other hand, for the parametrized tests of this functionality in `test_remote.py`, a **disk block
# size** of 4096 bytes (4 KiB) is assumed, meaning that a file with, e.g., only 1 or 10 bytes of actual
# content (its `apparent-size`) still occupies 4096 bytes (4 KiB) on disk. In principle, the disk block size
# can differ between different disk formattings, OS, etc., however, as 4096 bytes (4 KiB) is the default for
# Linux's ext4, as well as macOS, we assume for now that it is a reasonable assumption and default value.

# Lastly, the `-s` (`--summarize`) option yields only a single total file size value.
retval, stdout, stderr = transport.exec_command_wait(f'du -s --block-size=1 {full_path}')
except NotImplementedError as exc:
raise NotImplementedError('`exec_command_wait` not implemented for the current transport plugin.') from exc

Expand Down
29 changes: 14 additions & 15 deletions tests/orm/nodes/data/test_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ def test_clean(remote_data_factory, mode):
@pytest.mark.parametrize(
'setup, results',
(
(('du', False), ('4.01 KB', 'du')),
(('du', True), (4108, 'du')),
(('du', False), ('8.00 KB', 'du')),
(('du', True), (8192, 'du')),
(('stat', False), ('12.00 B', 'stat')),
(('stat', True), (12, 'stat')),
),
Expand All @@ -74,13 +74,12 @@ def test_get_size_on_disk_params(remote_data_factory, mode, setup, results):
@pytest.mark.parametrize(
'content, sizes',
(
(b'a', {'du': 4097, 'stat': 1, 'human': '4.00 KB'}),
(10 * b'a', {'du': 4106, 'stat': 10, 'human': '4.01 KB'}),
(100 * b'a', {'du': 4196, 'stat': 100, 'human': '4.10 KB'}),
(1000 * b'a', {'du': 5096, 'stat': 1000, 'human': '4.98 KB'}),
(1000000 * b'a', {'du': 1004096, 'stat': int(1e6), 'human': '980.56 KB'}),
(b'a', {'du': 8192, 'stat': 1, 'human': '8.00 KB'}),
(10 * b'a', {'du': 8192, 'stat': 10, 'human': '8.00 KB'}),
(1000 * b'a', {'du': 8192, 'stat': 1000, 'human': '8.00 KB'}),
(1000000 * b'a', {'du': 1007616, 'stat': int(1e6), 'human': '984.00 KB'}),
),
ids=['1-byte', '10-bytes', '100-bytes', '1000-bytes', '1e6-bytes'],
ids=['1-byte', '10-bytes', '1000-bytes', '1e6-bytes'],
)
def test_get_size_on_disk_sizes(remote_data_factory, mode, content, sizes):
"""Test the different implementations to obtain the size of a ``RemoteData`` on disk."""
Expand All @@ -103,12 +102,12 @@ def test_get_size_on_disk_sizes(remote_data_factory, mode, content, sizes):
@pytest.mark.parametrize(
'num_char, relpath, sizes',
(
(1, '.', {'du': 12291, 'stat': 8195, 'human': '12.00 KB'}),
(100, '.', {'du': 12588, 'stat': 8492, 'human': '12.29 KB'}),
(int(1e6), '.', {'du': 3012288, 'stat': 3008192, 'human': '2.87 MB'}),
(1, 'subdir1', {'du': 8194, 'stat': 4098, 'human': '8.00 KB'}),
(100, 'subdir1', {'du': 8392, 'stat': 4296, 'human': '8.20 KB'}),
(int(1e6), 'subdir1', {'du': 2008192, 'stat': 2004096, 'human': '1.92 MB'}),
(1, '.', {'du': 24576, 'stat': 8195, 'human': '24.00 KB'}),
(100, '.', {'du': 24576, 'stat': 8492, 'human': '24.00 KB'}),
(int(1e6), '.', {'du': 3022848, 'stat': 3008192, 'human': '2.88 MB'}),
(1, 'subdir1', {'du': 16384, 'stat': 4098, 'human': '16.00 KB'}),
(100, 'subdir1', {'du': 16384, 'stat': 4296, 'human': '16.00 KB'}),
(int(1e6), 'subdir1', {'du': 2015232, 'stat': 2004096, 'human': '1.92 MB'}),
),
)
def test_get_size_on_disk_nested(aiida_localhost, tmp_path, num_char, relpath, sizes):
Expand Down Expand Up @@ -173,7 +172,7 @@ def test_get_size_on_disk_du(remote_data_factory, mode, monkeypatch):

with authinfo.get_transport() as transport:
size_on_disk = remote_data._get_size_on_disk_du(transport=transport, full_path=full_path)
assert size_on_disk == 4108
assert size_on_disk == 8192

# Monkeypatch transport exec_command_wait command to simulate it not being implemented, e.g., for FirecREST plugin
def mock_exec_command_wait(command):
Expand Down

0 comments on commit 2da3f96

Please sign in to comment.