Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-17002 test: fix rebuild_fio aggregation detection #15817

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 19 additions & 23 deletions src/tests/ftest/erasurecode/rebuild_fio.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
'''
(C) Copyright 2019-2024 Intel Corporation.
(C) Copyright 2025 Hewlett Packard Enterprise Development LP

SPDX-License-Identifier: BSD-2-Clause-Patent
'''
Expand All @@ -24,13 +25,11 @@ def execution(self, rebuild_mode):
Args:
rebuild_mode (str): On-line or off-line rebuild mode
"""
aggregation_threshold = self.params.get("threshold", "/run/pool/aggregation/*")
aggregation_timeout = self.params.get("aggr_timeout", "/run/pool/aggregation/*")
read_option = self.params.get("rw_read", "/run/fio/test/read_write/*")

engine_count = self.server_managers[0].get_config_value("engines_per_host")
server_count = len(self.hostlist_servers) * engine_count
rank_to_kill = server_count - 1
num_ranks = len(self.server_managers[0].ranks)
rank_to_kill = num_ranks - 1

# 1. Disable aggregation
self.log_step("Disable aggregation")
Expand All @@ -54,33 +53,30 @@ def execution(self, rebuild_mode):

# Get initial total free space (scm+nvme)
self.log_step("Get initial total free space (scm+nvme)")
init_free_space = pool.get_total_free_space(refresh=True)
initial_free_space = pool.get_total_free_space(refresh=True)

# Enable aggregation
self.log_step("Enable aggregation")
pool.enable_aggregation()

# Get total space consumed (scm+nvme) after aggregation enabled, verify and wait until
# aggregation triggered, maximum 3 minutes.
# Wait for aggregation to be triggered.
# Assume an increase in total free space means aggregation is triggered.
self.log_step("Verify the Fio write finish without any error")
start_time = time.time()
timed_out = False
aggr_triggered = False
self.log_step("Verify and wait until aggregation triggered")
while not aggr_triggered and not timed_out:
# Check if current free space exceeds threshold
free_space = pool.get_total_free_space(refresh=True)
difference = free_space - init_free_space
aggr_triggered = difference >= aggregation_threshold
self.log.debug("Total Free space: initial=%s, current=%s, difference=%s",
"{:,}".format(init_free_space), "{:,}".format(free_space),
"{:,}".format(difference))
while True:
# Check if current free space exceeds initial free space
current_free_space = pool.get_total_free_space(refresh=True)
self.log.debug(
"Total Free space: initial=%s, current=%s",
"{:,}".format(initial_free_space), "{:,}".format(current_free_space))
if current_free_space > initial_free_space:
break
# Check timeout
timed_out = (time.time() - start_time) > aggregation_timeout
if not aggr_triggered and not timed_out:
time.sleep(1)
if timed_out:
self.fail(f"Aggregation not observed within {aggregation_timeout} seconds")
if (time.time() - start_time) > aggregation_timeout:
self.fail(f"Aggregation not observed within {aggregation_timeout} seconds")
self.log.debug("Rechecking in 5 seconds")
time.sleep(5)

# ec off-line rebuild fio
if 'off-line' in rebuild_mode:
Expand All @@ -100,7 +96,7 @@ def execution(self, rebuild_mode):
# If RF is 2, kill one more server rank and validate that the data is not corrupted.
if int(container.properties.value.split(":")[1]) == 2:
# Kill one more server rank
rank_to_kill = server_count - 2
rank_to_kill = num_ranks - 2
self.log_step(f"Kill one more server rank {rank_to_kill} when RF=2")
self.fio_cmd._jobs['test'].unlink.value = 1 # pylint: disable=protected-access
self.server_managers[0].stop_ranks([rank_to_kill], self.d_log, force=True)
Expand Down
1 change: 0 additions & 1 deletion src/tests/ftest/erasurecode/rebuild_fio.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ server_config:
pool:
size: 60%
aggregation:
threshold: 50000000
aggr_timeout: 180
set_logmasks: False
container:
Expand Down