Skip to content

Commit

Permalink
DAOS-17002 test: fix rebuild_fio aggregation detection
Browse files (browse the repository at this point in the history)
Simplify and fix aggregation detection in rebuild_fio by checking
whether the total free space has increased above its initial value.

Test-repeat: 3
Skip-unit-tests: true
Skip-fault-injection-test: true

Signed-off-by: Dalton Bohning <[email protected]>
  • Loading branch information
daltonbohning committed Feb 3, 2025
1 parent aee0149 commit 942cbb9
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 24 deletions.
42 changes: 19 additions & 23 deletions src/tests/ftest/erasurecode/rebuild_fio.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
'''
(C) Copyright 2019-2024 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP
SPDX-License-Identifier: BSD-2-Clause-Patent
'''
Expand All @@ -24,13 +25,11 @@ def execution(self, rebuild_mode):
Args:
rebuild_mode (str): On-line or off-line rebuild mode
"""
aggregation_threshold = self.params.get("threshold", "/run/pool/aggregation/*")
aggregation_timeout = self.params.get("aggr_timeout", "/run/pool/aggregation/*")
read_option = self.params.get("rw_read", "/run/fio/test/read_write/*")

engine_count = self.server_managers[0].get_config_value("engines_per_host")
server_count = len(self.hostlist_servers) * engine_count
rank_to_kill = server_count - 1
num_ranks = len(self.server_managers[0].ranks)
rank_to_kill = num_ranks - 1

# 1. Disable aggregation
self.log_step("Disable aggregation")
Expand All @@ -54,33 +53,30 @@ def execution(self, rebuild_mode):

# Get initial total free space (scm+nvme)
self.log_step("Get initial total free space (scm+nvme)")
init_free_space = pool.get_total_free_space(refresh=True)
initial_free_space = pool.get_total_free_space(refresh=True)

# Enable aggregation
self.log_step("Enable aggregation")
pool.enable_aggregation()

# Get total space consumed (scm+nvme) after aggregation enabled, verify and wait until
# aggregation triggered, maximum 3 minutes.
# Wait for aggregation to be triggered.
# Assume an increase in total free space means aggregation is triggered.
self.log_step("Verify the Fio write finish without any error")
start_time = time.time()
timed_out = False
aggr_triggered = False
self.log_step("Verify and wait until aggregation triggered")
while not aggr_triggered and not timed_out:
# Check if current free space exceeds threshold
free_space = pool.get_total_free_space(refresh=True)
difference = free_space - init_free_space
aggr_triggered = difference >= aggregation_threshold
self.log.debug("Total Free space: initial=%s, current=%s, difference=%s",
"{:,}".format(init_free_space), "{:,}".format(free_space),
"{:,}".format(difference))
while True:
# Check if current free space exceeds initial free space
current_free_space = pool.get_total_free_space(refresh=True)
self.log.debug(
"Total Free space: initial=%s, current=%s",
"{:,}".format(initial_free_space), "{:,}".format(current_free_space))
if current_free_space > initial_free_space:
break
# Check timeout
timed_out = (time.time() - start_time) > aggregation_timeout
if not aggr_triggered and not timed_out:
time.sleep(1)
if timed_out:
self.fail(f"Aggregation not observed within {aggregation_timeout} seconds")
if (time.time() - start_time) > aggregation_timeout:
self.fail(f"Aggregation not observed within {aggregation_timeout} seconds")
self.log.debug("Rechecking in 5 seconds")
time.sleep(5)

# ec off-line rebuild fio
if 'off-line' in rebuild_mode:
Expand All @@ -100,7 +96,7 @@ def execution(self, rebuild_mode):
# If RF is 2 kill one more server and validate the data is not corrupted.
if int(container.properties.value.split(":")[1]) == 2:
# Kill one more server rank
rank_to_kill = server_count - 2
rank_to_kill = num_ranks - 2
self.log_step(f"Kill one more server rank {rank_to_kill} when RF=2")
self.fio_cmd._jobs['test'].unlink.value = 1 # pylint: disable=protected-access
self.server_managers[0].stop_ranks([rank_to_kill], self.d_log, force=True)
Expand Down
1 change: 0 additions & 1 deletion src/tests/ftest/erasurecode/rebuild_fio.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ server_config:
pool:
size: 60%
aggregation:
threshold: 50000000
aggr_timeout: 180
set_logmasks: False
container:
Expand Down

0 comments on commit 942cbb9

Please sign in to comment.