Skip to content

Commit

Permalink
Add amd gpgpu tests (New) (#1531)
Browse files Browse the repository at this point in the history
* Add amd-gpu-setup script

* Add basic ROCm Validation Suite wrapper.

This wrapper runs RVS with the pre-defined module configuration found in
`data/rvs-*.conf`, and then validates that the module test was
successful.

* Log rvs stderr output

* Add NVIDIA vendor check & label for existing jobs.

* Add AMD GPU jobs to test plans

* Update gitignore

* Add config-dir option to rvs.py

* Add gpgpu rvs.py coverage tests

* Fix rvs.py backwards compatibility

Removed typing.override references.

* Fix rvs.py unhashable type argparse error.

* Fix rvs.py unhashable type error.

This time for real?

* Remove f-strings from rvs.py

* Fix rvs.py subprocess calls for python < 3.7

* Add rvs.py missing coverage test

* Use %s for logging messages.

* Move `rvs-iet` to stress test plan.

* Only take one module as a parameter

* Add user to render group

* Simplify `rvs.py`

- Use `SystemExit` instead of unused return values.
- Don't use `re` where not needed
- Use `_validate_output` instead of overriding `_run`.

* Fix `rvs.py` coverage tests.

- Added some fail descriptors
- Remove unnecessary check in `parse_args` since I added the `choices` parameter

* Change `--config` parameter of `rvs.py`

Since the script only runs one module at a time, it makes more sense to take the path to the config file directly.

* Format test_rvs.py

* Fix NVIDIA GPU vendor

* Remove unnecessary which_rvs

* Simplify ModuleRunner classes

* Fix typo

* Update coverage tests
  • Loading branch information
pedro-avalos authored Oct 23, 2024
1 parent f607e2b commit fc12ed5
Show file tree
Hide file tree
Showing 14 changed files with 710 additions and 7 deletions.
7 changes: 7 additions & 0 deletions providers/gpgpu/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# python
*.egg-info
*.pyc
.coverage
MANIFEST
__pycache__

# packaging folders
build
dist
Expand Down
205 changes: 205 additions & 0 deletions providers/gpgpu/bin/rvs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
#!/usr/bin/env python3
"""ROCm Validation Suite wrapper.
Copyright (C) 2024 Canonical Ltd.
Authors
Pedro Avalos Jimenez <[email protected]>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License version 3,
as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
The purpose of this script is to simply wrap the ROCm Validation Suite
executable, adding an appropriate failure exit code when a test fails.
"""

import argparse
import logging
import os
import re
import shutil
import subprocess
from pathlib import Path

# Fallback used as the --rvs default when no `rvs` executable is on PATH.
RVS_BIN = Path("/opt/rocm/bin/rvs")
"""Default location for ROCm Validation Suite binary."""

# Directory holding the default `rvs-<module>.conf` files; read from the
# PLAINBOX_PROVIDER_DATA environment variable, else the current directory.
PLAINBOX_PROVIDER_DATA = Path(os.getenv("PLAINBOX_PROVIDER_DATA", "."))
"""Location of the RVS module configurations."""


class ModuleRunner:
    """Base runner that executes one RVS module and checks the outcome.

    The base class treats a zero exit status as success. Subclasses
    override `_validate_output` to additionally inspect the module's
    standard output.
    """

    def __init__(self, rvs: Path, config: Path) -> None:
        """Store the paths needed to launch RVS.

        Args:
            rvs: Path to the RVS executable.
            config: Path to the module configuration passed to `-c`.
        """
        self.rvs = rvs
        self.config = config

    def _validate_output(self, output: str, module: str) -> bool:
        """Return True when `output` shows that `module` passed.

        The base runner performs no output inspection, so any output
        from a process that exited with status zero is accepted.
        """
        return True

    def run(self, module: str) -> None:
        """Run the RVS module and validate its result.

        Args:
            module: Name of the module, used in log and error messages.

        Raises:
            SystemExit: if RVS exits with a nonzero status, or if the
                runner's `_validate_output` rejects the output.
        """
        logging.debug("%s: RUNNING", module)

        result = subprocess.run(
            # Convert explicitly: path-like objects in the argument list
            # are only accepted by subprocess from Python 3.6 onwards.
            [str(self.rvs), "-c", str(self.config)],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            check=False,
        )

        if result.returncode != 0:
            logging.error(result.stdout)
            logging.error(result.stderr)
            raise SystemExit("%s: FAILURE - failed to execute" % module)

        # A successful run may still write to stderr; keep it for debugging.
        if result.stderr:
            logging.debug(result.stderr)

        if not self._validate_output(result.stdout, module):
            logging.error(result.stdout)
            raise SystemExit("%s: FAILURE - validation failed" % module)

        logging.info(result.stdout)
        logging.debug("%s: SUCCESS", module)


class PassFailModuleRunner(ModuleRunner):
    """Runner for modules whose output carries an explicit pass marker."""

    def _validate_output(self, output: str, module: str):
        """Tell whether any output line contains a known success marker.

        Accepted markers are the module name followed by " true",
        "pass: TRUE", or a GFLOPS report ending in "met: TRUE".
        A single matching line is enough for the run to count as passed.
        """
        success_patterns = [
            re.compile(r"%s true" % re.escape(module)),
            re.compile(r"pass: TRUE"),
            re.compile(r"GFLOPS \d+ Target GFLOPS: \d+ met: TRUE"),
        ]

        # Scan line by line and stop at the first success marker found
        for text in output.splitlines():
            for pattern in success_patterns:
                if pattern.search(text):
                    return True

        return False


class MemModuleRunner(ModuleRunner):
    """Runner for the memory test module."""

    def _validate_output(self, output: str, module: str):
        """Tell whether memory tests 1 through 11 all reported PASS.

        Each test number needs at least one output line containing
        "mem Test <number> : PASS"; a single missing one fails the run.
        """
        lines = output.splitlines()
        expected = ("mem Test %s : PASS" % number for number in range(1, 12))
        return all(
            any(marker in line for line in lines) for marker in expected
        )


# The keys double as the accepted values of the positional `module`
# argument. Modules mapped to the plain ModuleRunner are judged by the rvs
# exit status alone; the other runners also inspect stdout for pass markers.
RVS_MODULES = {
    "gpup": ModuleRunner,
    "peqt": PassFailModuleRunner,
    "pebb": ModuleRunner,
    "pbqt": ModuleRunner,
    "iet": PassFailModuleRunner,
    "babel": ModuleRunner,
    "mem": MemModuleRunner,
    "gst": PassFailModuleRunner,
}
"""Mapping of module to corresponding runner to use for it."""


def parse_args(argv=None):
    """Parse and validate the command-line arguments.

    Args:
        argv: Optional list of argument strings to parse. When None
            (the default) argparse reads the process arguments, which
            is what existing callers rely on.

    Returns:
        The parsed namespace. `config` is always set: when --config is
        not given it points at `rvs-<module>.conf` inside
        PLAINBOX_PROVIDER_DATA.

    Raises:
        SystemExit: after printing the supported modules when
            --list-modules is given, or through `parser.error` when
            neither a module nor --list-modules is supplied.
    """
    parser = argparse.ArgumentParser(
        description="ROCm Validation Suite wrapper"
    )
    parser.add_argument(
        "module",
        metavar="MOD",
        nargs="?",
        type=str,
        choices=RVS_MODULES.keys(),
        help="RVS module to run [{}]".format(", ".join(RVS_MODULES.keys())),
    )
    parser.add_argument(
        "-l",
        "--list-modules",
        action="store_true",
        help="List supported RVS modules",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        dest="log_level",
        action="store_const",
        const=logging.DEBUG,
        default=logging.INFO,
        help="Increase logging verbosity",
    )
    parser.add_argument(
        "--rvs",
        metavar="EXE",
        type=Path,
        # Prefer an `rvs` found on PATH; otherwise use the default location
        default=Path(shutil.which("rvs") or RVS_BIN),
        help="Path to RVS binary",
    )
    parser.add_argument(
        "-c",
        "--config",
        metavar="PATH",
        type=Path,
        default=None,
        help="Path to RVS module configuration",
    )

    args = parser.parse_args(argv)

    # The module is optional on the command line only so that
    # --list-modules can be used on its own.
    if args.list_modules:
        parser.exit(
            message="Modules supported: {}\n".format(" ".join(RVS_MODULES))
        )
    elif not args.module:
        parser.error("--list-modules or module required")

    # Add default configuration if none is provided
    if args.config is None:
        args.config = PLAINBOX_PROVIDER_DATA / "rvs-{}.conf".format(
            args.module
        )

    return args


def main():
    """Parse the arguments, then run the selected RVS module.

    Failures surface as SystemExit raised by the runner; reaching the
    final log line means the module passed.
    """
    options = parse_args()
    logging.basicConfig(level=options.log_level)

    module = options.module
    logging.debug("Module to run: %s", module)

    # Look up the runner class registered for this module and run it
    runner_cls = RVS_MODULES[module]
    runner_cls(options.rvs, options.config).run(module)

    logging.info("Result: PASS")


if __name__ == "__main__":
    main()
16 changes: 16 additions & 0 deletions providers/gpgpu/data/rvs-babel.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# -*- mode: yaml -*-

# Tests memory transfer rates using BabelStream
actions:
- name: mem_transfer
module: babel
device: all
parallel: true
num_iter: 5000
array_size: 33554432
test_type: 2
mibibytes: false
o/p_csv: false
subtest: 1

# vim: ft=yaml
13 changes: 13 additions & 0 deletions providers/gpgpu/data/rvs-gpup.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# -*- mode: yaml -*-

# Displays the GPUs found and their properties
actions:
- name: query
module: gpup
device: all
properties:
all:
io_links-properties:
all:

# vim: ft=yaml
25 changes: 25 additions & 0 deletions providers/gpgpu/data/rvs-gst.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# -*- mode: yaml -*-

# duration (ms): 4 hours
# target_stress (GFLOPS): minimum stress to maintain
# ramp_interval (ms): initial time to wait for target_stress to be reached
# ops_type: Single-precision General Matrix Multiplication test
actions:
- name: stress-4hrs
module: gst
device: all
parallel: true
count: 1
duration: 14400000
ramp_interval: 300000
log_interval: 6000
target_stress: 8000
max_violations: 1
copy_matrix: false
tolerance: 0.01
matrix_size_a: 8640
matrix_size_b: 8640
matrix_size_c: 8640
ops_type: sgemm

# vim: ft=yaml
19 changes: 19 additions & 0 deletions providers/gpgpu/data/rvs-iet.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# -*- mode: yaml -*-

# Tests peak power capabilities of a GPU
actions:
- name: power_capabilities
module: iet
device: all
wait: 100
duration: 50000
ramp_interval: 5000
sample_interval: 700
log_interval: 700
max_violations: 1
target_power: 140
tolerance: 0.1
matrix_size: 8640
ops_type: dgemm

# vim: ft=yaml
19 changes: 19 additions & 0 deletions providers/gpgpu/data/rvs-mem.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# -*- mode: yaml -*-

# Tests the GPU memory for hardware errors and soft errors
# Tests excluded: 9) Modulo 20, random patterns; 10) Bit fade test, 90 min, 2 patterns
actions:
- name: mem_test
device: all
module: mem
parallel: true
wait: 100
mapped_memory: false
mem_blocks: 128
num_passes: 500
thrds_per_blk: 64
stress: true
num_iter: 50000
exclude : 9 10

# vim: ft=yaml
14 changes: 14 additions & 0 deletions providers/gpgpu/data/rvs-pbqt.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# -*- mode: yaml -*-

# Tests for P2P compatibility and performs bandwidth tests
actions:
- name: p2p_compat
device: all
module: pbqt
log_interval: 800
duration: 5000
peers: all
test_bandwidth: true
bidirectional: true

# vim: ft=yaml
14 changes: 14 additions & 0 deletions providers/gpgpu/data/rvs-pebb.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# -*- mode: yaml -*-

# Tests PCIe bandwidth
actions:
- name: pci_bandwidth
module: pebb
device: all
log_interval: 800
duration: 2000
device_to_host: false
host_to_device: true
parallel: false

# vim: ft=yaml
28 changes: 28 additions & 0 deletions providers/gpgpu/data/rvs-peqt.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# -*- mode: yaml -*-

# Ensures the PCIe configuration is compatible with ROCm
actions:
- name: pci_config
module: peqt
device: all
capability:
link_cap_max_speed:
link_cap_max_width:
link_stat_cur_speed:
link_stat_neg_width:
slot_pwr_limit_value:
slot_physical_num:
device_id:
vendor_id:
kernel_driver:
dev_serial_num:
D0_Maximum_Power_12V:
D0_Maximum_Power_3_3V:
D0_Sustained_Power_12V:
D0_Sustained_Power_3_3V:
atomic_op_routing:
atomic_op_32_completer:
atomic_op_64_completer:
atomic_op_128_CAS_completer:

# vim: ft=yaml
Loading

0 comments on commit fc12ed5

Please sign in to comment.