-
Notifications
You must be signed in to change notification settings - Fork 52
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add amd-gpu-setup script * Add basic ROCm Validation Suite wrapper. This wrapper runs RVS with the pre-defined module configuration found in `data/rvs-*.conf`, and then validates that the module test was successful. * Log rvs stderr output * Add NVIDIA vendor check & label for existing jobs. * Add AMD GPU jobs to test plans * Update gitignore * Add config-dir option to rvs.py * Add gpgpu rvs.py coverage tests * Fix rvs.py backwards compatibility Removed typing.override references. * Fix rvs.py unhashable type argparse error. * Fix rvs.py unhashable type error. This time for real? * Remove f-strings from rvs.py * Fix rvs.py subprocess calls for python < 3.7 * Add rvs.py missing coverage test * Use %s for logging messages. * Move `rvs-iet` to stress test plan. * Only take one module as a parameter * Add user to render group * Simplify `rvs.py` - Use `SystemExit` instead of unused return values. - Don't use `re` where not needed - Use `_validate_output` instead of overriding `_run`. * Fix `rvs.py` coverage tests. - Added some fail descriptors - Remove unnecessary check in `parse_args` since I added the `choices` parameter * Change `--config` parameter of `rvs.py` Since the script only runs one module at a time, it makes more sense to take the path to the config file directly. * Format test_rvs.py * Fix NVIDIA GPU vendor * Remove unnecessary which_rvs * Simplify ModuleRunner classes * Fix typo * Update coverage tests
- Loading branch information
1 parent
f607e2b
commit fc12ed5
Showing
14 changed files
with
710 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,10 @@ | ||
# python | ||
*.egg-info | ||
*.pyc | ||
.coverage | ||
MANIFEST | ||
__pycache__ | ||
|
||
# packaging folders | ||
build | ||
dist | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
#!/usr/bin/env python3 | ||
"""ROCm Validation Suite wrapper. | ||
Copyright (C) 2024 Canonical Ltd. | ||
Authors | ||
Pedro Avalos Jimenez <[email protected]> | ||
This program is free software: you can redistribute it and/or modify | ||
it under the terms of the GNU General Public License version 3, | ||
as published by the Free Software Foundation. | ||
This program is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
GNU General Public License for more details. | ||
You should have received a copy of the GNU General Public License | ||
along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
The purpose of this script is to simply wrap the ROCm Validation Suite | ||
executable, adding an appropriate failure exit code when a test fails. | ||
""" | ||
|
||
import argparse | ||
import logging | ||
import os | ||
import re | ||
import shutil | ||
import subprocess | ||
from pathlib import Path | ||
|
||
# Default location for ROCm Validation Suite binary.
RVS_BIN = Path("/opt/rocm/bin/rvs")

# Location of the RVS module configurations; taken from the
# PLAINBOX_PROVIDER_DATA environment variable, falling back to ".".
PLAINBOX_PROVIDER_DATA = Path(os.environ.get("PLAINBOX_PROVIDER_DATA", "."))
|
||
|
||
class ModuleRunner:
    """Base runner that executes RVS with one configuration file.

    The base runner treats a zero exit status as success. Subclasses
    tighten the pass criteria by overriding ``_validate_output``.
    """

    def __init__(self, rvs: Path, config: Path) -> None:
        """Store the paths used by ``run``.

        Args:
            rvs: Path to the RVS executable.
            config: Path to the configuration file passed to ``rvs -c``.
        """
        self.rvs = rvs
        self.config = config

    def _validate_output(self, output: str, module: str) -> bool:
        """Return whether ``output`` shows that ``module`` passed.

        The base implementation has no output criteria and always
        returns True.
        """
        return True

    def run(self, module: str) -> None:
        """Run RVS and validate its output.

        Args:
            module: Module name; used for log and error messages and
                handed to ``_validate_output``.

        Raises:
            SystemExit: If RVS cannot be started, exits with a non-zero
                status, or its output fails validation.
        """
        logging.debug("%s: RUNNING", module)

        try:
            result = subprocess.run(
                # str() keeps the call working on interpreters whose
                # subprocess module does not accept path-like arguments.
                [str(self.rvs), "-c", str(self.config)],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
                check=False,
            )
        except OSError as exc:
            # E.g. the binary is missing or not executable: report it as
            # a regular failure instead of an uncaught traceback.
            logging.error("%s", exc)
            raise SystemExit("%s: FAILURE - failed to execute" % module)

        if result.returncode != 0:
            logging.error(result.stdout)
            logging.error(result.stderr)
            raise SystemExit("%s: FAILURE - failed to execute" % module)

        # Log the error output as debug
        if result.stderr:
            logging.debug(result.stderr)

        if not self._validate_output(result.stdout, module):
            logging.error(result.stdout)
            raise SystemExit("%s: FAILURE - validation failed" % module)

        logging.info(result.stdout)
        logging.debug("%s: SUCCESS", module)
|
||
|
||
class PassFailModuleRunner(ModuleRunner):
    """Runner for modules whose output carries an explicit pass marker."""

    def _validate_output(self, output: str, module: str):
        """Return True if any output line matches a success pattern.

        Args:
            output: Captured standard output of the RVS run.
            module: Module name; it is escaped and used in the
                "<module> true" pattern.
        """
        # Regular expressions that identify a successful module run
        success_patterns = (
            r"%s true" % re.escape(module),
            r"pass: TRUE",
            r"GFLOPS \d+ Target GFLOPS: \d+ met: TRUE",
        )

        # One matching line anywhere in stdout is enough
        return any(
            re.search(pattern, row)
            for row in output.splitlines()
            for pattern in success_patterns
        )
|
||
|
||
class MemModuleRunner(ModuleRunner):
    """Runner for the memory test module."""

    def _validate_output(self, output: str, module: str):
        """Return True only if memory tests 1 through 11 all report PASS.

        Each test number needs at least one output line containing
        "mem Test <number> : PASS".
        """
        rows = output.splitlines()
        return all(
            any("mem Test %s : PASS" % number in row for row in rows)
            for number in range(1, 12)
        )
|
||
|
||
# Mapping of module to corresponding runner to use for it.
# Insertion order is the order in which the modules are listed in the
# command-line help and by --list-modules.
RVS_MODULES = dict(
    (
        ("gpup", ModuleRunner),
        ("peqt", PassFailModuleRunner),
        ("pebb", ModuleRunner),
        ("pbqt", ModuleRunner),
        ("iet", PassFailModuleRunner),
        ("babel", ModuleRunner),
        ("mem", MemModuleRunner),
        ("gst", PassFailModuleRunner),
    )
)
|
||
|
||
def parse_args(argv=None):
    """Parse the command-line arguments.

    Args:
        argv: Optional list of argument strings. When None (the default)
            argparse reads the arguments from ``sys.argv``.

    Returns:
        The parsed ``argparse.Namespace`` with ``module``,
        ``list_modules``, ``log_level``, ``rvs`` and ``config``. When no
        ``--config`` is given, ``config`` is set to
        ``rvs-<module>.conf`` inside ``PLAINBOX_PROVIDER_DATA``.

    Raises:
        SystemExit: After listing the modules for ``--list-modules``, or
            on a usage error (including a missing module).
    """
    module_names = list(RVS_MODULES)

    parser = argparse.ArgumentParser(
        description="ROCm Validation Suite wrapper"
    )
    parser.add_argument(
        "module",
        metavar="MOD",
        nargs="?",
        type=str,
        choices=module_names,
        help="RVS module to run [{}]".format(", ".join(module_names)),
    )
    parser.add_argument(
        "-l",
        "--list-modules",
        action="store_true",
        help="List supported RVS modules",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        dest="log_level",
        action="store_const",
        const=logging.DEBUG,
        default=logging.INFO,
        help="Increase logging verbosity",
    )
    parser.add_argument(
        "--rvs",
        metavar="EXE",
        type=Path,
        # Prefer an rvs found on PATH, otherwise use the default location
        default=Path(shutil.which("rvs") or RVS_BIN),
        help="Path to RVS binary",
    )
    parser.add_argument(
        "-c",
        "--config",
        metavar="PATH",
        type=Path,
        default=None,
        help="Path to RVS module configuration",
    )

    args = parser.parse_args(argv)

    if args.list_modules:
        # parser.exit(message=...) writes its message to stderr; the
        # listing is the requested output, so print it to stdout.
        print("Modules supported: {}".format(" ".join(module_names)))
        parser.exit()
    if not args.module:
        # The positional is optional only so that --list-modules can be
        # used on its own.
        parser.error("--list-modules or module required")

    # Add default configuration if none is provided
    if args.config is None:
        args.config = PLAINBOX_PROVIDER_DATA / "rvs-{}.conf".format(
            args.module
        )

    return args
|
||
|
||
def main():
    """Parse the arguments, run the selected RVS module and log the result.

    A failing module aborts via the SystemExit raised by its runner, so
    the final PASS line is only logged on success.
    """
    options = parse_args()
    logging.basicConfig(level=options.log_level)

    module = options.module
    logging.debug("Module to run: %s", module)

    # Look up the runner class registered for the module and execute it
    runner_cls = RVS_MODULES[module]
    runner_cls(options.rvs, options.config).run(module)

    logging.info("Result: PASS")
|
||
|
||
# Run the wrapper only when the file is executed as a script.
if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# -*- mode: yaml -*- | ||
|
||
# Tests memory transfer rates using BabelStream | ||
actions: | ||
- name: mem_transfer | ||
module: babel | ||
device: all | ||
parallel: true | ||
num_iter: 5000 | ||
array_size: 33554432 | ||
test_type: 2 | ||
mibibytes: false | ||
o/p_csv: false | ||
subtest: 1 | ||
|
||
# vim: ft=yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# -*- mode: yaml -*- | ||
|
||
# Displays the GPUs found and their properties | ||
actions: | ||
- name: query | ||
module: gpup | ||
device: all | ||
properties: | ||
all: | ||
io_links-properties: | ||
all: | ||
|
||
# vim: ft=yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
# -*- mode: yaml -*- | ||
|
||
# duration (ms): 4 hours | ||
# target_stress (GFLOPS): minimum stress to maintain | ||
# ramp_interval (ms): initial time to wait for target_stress to be reached | ||
# ops_type: Single-precision General Matrix Multiplication test | ||
actions: | ||
- name: stress-4hrs | ||
module: gst | ||
device: all | ||
parallel: true | ||
count: 1 | ||
duration: 14400000 | ||
ramp_interval: 300000 | ||
log_interval: 6000 | ||
target_stress: 8000 | ||
max_violations: 1 | ||
copy_matrix: false | ||
tolerance: 0.01 | ||
matrix_size_a: 8640 | ||
matrix_size_b: 8640 | ||
matrix_size_c: 8640 | ||
ops_type: sgemm | ||
|
||
# vim: ft=yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# -*- mode: yaml -*- | ||
|
||
# Tests peak power capabilities of a GPU | ||
actions: | ||
- name: power_capabilities | ||
module: iet | ||
device: all | ||
wait: 100 | ||
duration: 50000 | ||
ramp_interval: 5000 | ||
sample_interval: 700 | ||
log_interval: 700 | ||
max_violations: 1 | ||
target_power: 140 | ||
tolerance: 0.1 | ||
matrix_size: 8640 | ||
ops_type: dgemm | ||
|
||
# vim: ft=yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# -*- mode: yaml -*- | ||
|
||
# Tests the GPU memory for hardware errors and soft errors | ||
# Tests excluded: 9) Modulo 20, random patterns; 10) Bit fade test, 90 min, 2 patterns | ||
actions: | ||
- name: mem_test | ||
device: all | ||
module: mem | ||
parallel: true | ||
wait: 100 | ||
mapped_memory: false | ||
mem_blocks: 128 | ||
num_passes: 500 | ||
thrds_per_blk: 64 | ||
stress: true | ||
num_iter: 50000 | ||
exclude : 9 10 | ||
|
||
# vim: ft=yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# -*- mode: yaml -*- | ||
|
||
# Tests for P2P compatibility and performs bandwidth tests | ||
actions: | ||
- name: p2p_compat | ||
device: all | ||
module: pbqt | ||
log_interval: 800 | ||
duration: 5000 | ||
peers: all | ||
test_bandwidth: true | ||
bidirectional: true | ||
|
||
# vim: ft=yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# -*- mode: yaml -*- | ||
|
||
# Tests PCIe bandwidth | ||
actions: | ||
- name: pci_bandwidth | ||
module: pebb | ||
device: all | ||
log_interval: 800 | ||
duration: 2000 | ||
device_to_host: false | ||
host_to_device: true | ||
parallel: false | ||
|
||
# vim: ft=yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# -*- mode: yaml -*- | ||
|
||
# Ensures the PCIe configuration is compatible with ROCm | ||
actions: | ||
- name: pci_config | ||
module: peqt | ||
device: all | ||
capability: | ||
link_cap_max_speed: | ||
link_cap_max_width: | ||
link_stat_cur_speed: | ||
link_stat_neg_width: | ||
slot_pwr_limit_value: | ||
slot_physical_num: | ||
device_id: | ||
vendor_id: | ||
kernel_driver: | ||
dev_serial_num: | ||
D0_Maximum_Power_12V: | ||
D0_Maximum_Power_3_3V: | ||
D0_Sustained_Power_12V: | ||
D0_Sustained_Power_3_3V: | ||
atomic_op_routing: | ||
atomic_op_32_completer: | ||
atomic_op_64_completer: | ||
atomic_op_128_CAS_completer: | ||
|
||
# vim: ft=yaml |
Oops, something went wrong.