Add amd gpgpu tests (New) (#1531)

* Add amd-gpu-setup script * Add basic ROCm Validation Suite wrapper. This wrapper runs RVS with the pre-defined module configuration found in `data/rvs-*.conf`, and then validates that the module test was successful. * Log rvs stderr output * Add NVIDIA vendor check & label for existing jobs. * Add AMD GPU jobs to test plans * Update gitignore * Add config-dir option to rvs.py * Add gpgpu rvs.py coverage tests * Fix rvs.py backwards compatibility Removed typing.override references. * Fix rvs.py unhashable type argparse error. * Fix rvs.py unhashable type error. This time for real? * Remove f-strings from rvs.py * Fix rvs.py subprocess calls for python < 3.7 * Add rvs.py missing coverage test * Use %s for logging messages. * Move `rvs-iet` to stress test plan. * Only take one module as a parameter * Add user to render group * Simplify `rvs.py` - Use `SystemExit` instead of unused return values. - Don't use `re` where not needed - Use `_validate_output` instead of overriding `_run`. * Fix `rvs.py` coverage tests. - Added some fail descriptors - Remove unnecessary check in `parse_args` since I added the `choices` parameter * Change `--config` parameter of `rvs.py` Since the script only runs one module at a time, it makes more sense to take the path to the config file directly. * Format test_rvs.py * Fix NVIDIA GPU vendor * Remove unnecessary which_rvs * Simplify ModuleRunner classes * Fix typo * Update coverage tests
canonical · Oct 23, 2024 · fc12ed5 · fc12ed5
1 parent f607e2b
commit fc12ed5
Show file tree

Hide file tree

Showing 14 changed files with 710 additions and 7 deletions.
diff --git a/providers/gpgpu/.gitignore b/providers/gpgpu/.gitignore
@@ -1,3 +1,10 @@
+# python
+*.egg-info
+*.pyc
+.coverage
+MANIFEST
+__pycache__
+
 # packaging folders
 build
 dist

diff --git a/providers/gpgpu/bin/rvs.py b/providers/gpgpu/bin/rvs.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+"""ROCm Validation Suite wrapper.
+
+Copyright (C) 2024 Canonical Ltd.
+
+Authors
+  Pedro Avalos Jimenez <[email protected]>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License version 3,
+as published by the Free Software Foundation.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+The purpose of this script is to simply wrap the ROCm Validation Suite
+executable, adding an appropriate failure exit code when a test fails.
+"""
+
+import argparse
+import logging
+import os
+import re
+import shutil
+import subprocess
+from pathlib import Path
+
+# Fallback path used by parse_args() when `rvs` is not found on PATH.
+RVS_BIN = Path("/opt/rocm/bin/rvs")
+"""Default location for ROCm Validation Suite binary."""
+
+# Read from the PLAINBOX_PROVIDER_DATA environment variable; falls back to
+# the current directory when the variable is unset.
+PLAINBOX_PROVIDER_DATA = Path(os.getenv("PLAINBOX_PROVIDER_DATA", "."))
+"""Location of the RVS module configurations."""
+
+
+class ModuleRunner:
+    """This class represents the base module runner."""
+
+    def __init__(self, rvs: Path, config: Path) -> None:
+        """Initializes the module runner."""
+        self.rvs = rvs
+        self.config = config
+
+    def run(self, module: str):
+        """Runs and validates the RVS module.
+
+        Returns: 0 on success, nonzero on failure.
+        Raises: RuntimeError if process failed execution.
+        """
+        logging.debug("%s: RUNNING", module)
+
+        result = subprocess.run(
+            [self.rvs, "-c", self.config],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            universal_newlines=True,
+            check=False,
+        )
+
+        if result.returncode != 0:
+            logging.error(result.stdout)
+            logging.error(result.stderr)
+            raise SystemExit("%s: FAILURE - failed to execute" % module)
+
+        # Log the error output as debug
+        if result.stderr:
+            logging.debug(result.stderr)
+
+        if hasattr(self, "_validate_output"):
+            if not self._validate_output(result.stdout, module):
+                logging.error(result.stdout)
+                raise SystemExit("%s: FAILURE - validation failed" % module)
+
+        logging.info(result.stdout)
+        logging.debug("%s: SUCCESS", module)
+
+
+class PassFailModuleRunner(ModuleRunner):
+    """This class represents a module runner that passes or fails."""
+
+    def _validate_output(self, output: str, module: str):
+        # Identify a successful module run
+        pass_strs = [
+            r"%s true" % re.escape(module),
+            r"pass: TRUE",
+            r"GFLOPS \d+ Target GFLOPS: \d+ met: TRUE",
+        ]
+
+        # Find any of the common success messages in stdout
+        for line in output.splitlines():
+            if any(re.search(pass_str, line) for pass_str in pass_strs):
+                return True
+
+        return False
+
+
+class MemModuleRunner(ModuleRunner):
+    """This class represents the memory test module runner."""
+
+    def _validate_output(self, output: str, module: str):
+        # Check that every memory test passed
+        for test in range(1, 12):
+            if any(
+                "mem Test %s : PASS" % test in line
+                for line in output.splitlines()
+            ):
+                continue
+            return False
+
+        return True
+
+
+# Modules mapped to the plain ModuleRunner are only checked for a zero exit
+# status; the other runners additionally inspect the captured stdout.
+# The keys also serve as the accepted values of the `module` CLI argument.
+RVS_MODULES = {
+    "gpup": ModuleRunner,
+    "peqt": PassFailModuleRunner,
+    "pebb": ModuleRunner,
+    "pbqt": ModuleRunner,
+    "iet": PassFailModuleRunner,
+    "babel": ModuleRunner,
+    "mem": MemModuleRunner,
+    "gst": PassFailModuleRunner,
+}
+"""Mapping of module to corresponding runner to use for it."""
+
+
+def parse_args():
+    """Parses command-line arguments."""
+    parser = argparse.ArgumentParser(
+        description="ROCm Validation Suite wrapper"
+    )
+    parser.add_argument(
+        "module",
+        metavar="MOD",
+        nargs="?",
+        type=str,
+        choices=RVS_MODULES.keys(),
+        help="RVS module to run [{}]".format(", ".join(RVS_MODULES.keys())),
+    )
+    parser.add_argument(
+        "-l",
+        "--list-modules",
+        action="store_true",
+        help="List supported RVS modules",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        dest="log_level",
+        action="store_const",
+        const=logging.DEBUG,
+        default=logging.INFO,
+        help="Increase logging verbosity",
+    )
+    parser.add_argument(
+        "--rvs",
+        metavar="EXE",
+        type=Path,
+        default=Path(shutil.which("rvs") or RVS_BIN),
+        help="Path to RVS binary",
+    )
+    parser.add_argument(
+        "-c",
+        "--config",
+        metavar="PATH",
+        type=Path,
+        default=None,
+        help="Path to RVS module configuration",
+    )
+
+    args = parser.parse_args()
+
+    if args.list_modules:
+        parser.exit(
+            message="Modules supported: {}\n".format(" ".join(RVS_MODULES))
+        )
+    elif not args.module:
+        parser.error("--list-modules or module required")
+
+    # Add default configuration if none is provided
+    if args.config is None:
+        args.config = PLAINBOX_PROVIDER_DATA / "rvs-{}.conf".format(
+            args.module
+        )
+
+    return args
+
+
+def main():
+    """Main entrypoint of the program."""
+    args = parse_args()
+    logging.basicConfig(level=args.log_level)
+
+    logging.debug("Module to run: %s", args.module)
+    runner = RVS_MODULES[args.module](args.rvs, args.config)
+    runner.run(args.module)
+
+    logging.info("Result: PASS")
+
+
+# Run the wrapper only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/providers/gpgpu/data/rvs-babel.conf b/providers/gpgpu/data/rvs-babel.conf
@@ -0,0 +1,16 @@
+# -*- mode: yaml -*-
+
+# Tests memory transfer rates using BabelStream
+actions:
+  - name: mem_transfer
+    module: babel
+    device: all
+    parallel: true
+    num_iter: 5000
+    array_size: 33554432
+    test_type: 2
+    mibibytes: false
+    o/p_csv: false
+    subtest: 1
+
+# vim: ft=yaml
diff --git a/providers/gpgpu/data/rvs-gpup.conf b/providers/gpgpu/data/rvs-gpup.conf
@@ -0,0 +1,13 @@
+# -*- mode: yaml -*-
+
+# Displays the GPUs found and their properties
+actions:
+  - name: query
+    module: gpup
+    device: all
+    properties:
+      all:
+    io_links-properties:
+      all:
+
+# vim: ft=yaml
diff --git a/providers/gpgpu/data/rvs-gst.conf b/providers/gpgpu/data/rvs-gst.conf
@@ -0,0 +1,25 @@
+# -*- mode: yaml -*-
+
+# duration (ms): 4 hours
+# target_stress (GFLOPS): minimum stress to maintain
+# ramp_interval (ms): initial time to wait for target_stress to be reached
+# ops_type: Single-precision General Matrix Multiplication test
+actions:
+  - name: stress-4hrs
+    module: gst
+    device: all
+    parallel: true
+    count: 1
+    duration: 14400000
+    ramp_interval: 300000
+    log_interval: 6000
+    target_stress: 8000
+    max_violations: 1
+    copy_matrix: false
+    tolerance: 0.01
+    matrix_size_a: 8640
+    matrix_size_b: 8640
+    matrix_size_c: 8640
+    ops_type: sgemm
+
+# vim: ft=yaml
diff --git a/providers/gpgpu/data/rvs-iet.conf b/providers/gpgpu/data/rvs-iet.conf
@@ -0,0 +1,19 @@
+# -*- mode: yaml -*-
+
+# Tests peak power capabilities of a GPU
+actions:
+  - name: power_capabilities
+    module: iet
+    device: all
+    wait: 100
+    duration: 50000
+    ramp_interval: 5000
+    sample_interval: 700
+    log_interval: 700
+    max_violations: 1
+    target_power: 140
+    tolerance: 0.1
+    matrix_size: 8640
+    ops_type: dgemm
+
+# vim: ft=yaml
diff --git a/providers/gpgpu/data/rvs-mem.conf b/providers/gpgpu/data/rvs-mem.conf
@@ -0,0 +1,19 @@
+# -*- mode: yaml -*-
+
+# Tests the GPU memory for hardware errors and soft errors
+# Tests excluded: 9) Modulo 20, random patterns; 10) Bit fade test, 90 min, 2 patterns
+actions:
+  - name: mem_test
+    device: all
+    module: mem
+    parallel: true
+    wait: 100
+    mapped_memory: false
+    mem_blocks: 128
+    num_passes: 500
+    thrds_per_blk: 64
+    stress: true
+    num_iter: 50000
+    exclude : 9 10
+
+# vim: ft=yaml
diff --git a/providers/gpgpu/data/rvs-pbqt.conf b/providers/gpgpu/data/rvs-pbqt.conf
@@ -0,0 +1,14 @@
+# -*- mode: yaml -*-
+
+# Tests for P2P compatibility and performs bandwidth tests
+actions:
+  - name: p2p_compat
+    device: all
+    module: pbqt
+    log_interval: 800
+    duration: 5000
+    peers: all
+    test_bandwidth: true
+    bidirectional: true
+
+# vim: ft=yaml
diff --git a/providers/gpgpu/data/rvs-pebb.conf b/providers/gpgpu/data/rvs-pebb.conf
@@ -0,0 +1,14 @@
+# -*- mode: yaml -*-
+
+# Tests PCIe bandwidth
+actions:
+  - name: pci_bandwidth
+    module: pebb
+    device: all
+    log_interval: 800
+    duration: 2000
+    device_to_host: false
+    host_to_device: true
+    parallel: false
+
+# vim: ft=yaml
diff --git a/providers/gpgpu/data/rvs-peqt.conf b/providers/gpgpu/data/rvs-peqt.conf
@@ -0,0 +1,28 @@
+# -*- mode: yaml -*-
+
+# Ensures the PCIe configuration is compatible with ROCm
+actions:
+  - name: pci_config
+    module: peqt
+    device: all
+    capability:
+      link_cap_max_speed:
+      link_cap_max_width:
+      link_stat_cur_speed:
+      link_stat_neg_width:
+      slot_pwr_limit_value:
+      slot_physical_num:
+      device_id:
+      vendor_id:
+      kernel_driver:
+      dev_serial_num:
+      D0_Maximum_Power_12V:
+      D0_Maximum_Power_3_3V:
+      D0_Sustained_Power_12V:
+      D0_Sustained_Power_3_3V:
+      atomic_op_routing:
+      atomic_op_32_completer:
+      atomic_op_64_completer:
+      atomic_op_128_CAS_completer:
+
+# vim: ft=yaml