From a0f2fab72bc26dfe919aff326028c7f134ec7146 Mon Sep 17 00:00:00 2001
From: Allison Piper <alliepiper16@gmail.com>
Date: Sat, 6 Apr 2024 22:05:40 +0000
Subject: [PATCH] Squashed commit of the following:

commit c5b2fc0a8b2a5332f46edfe95f94675a03eda0c1
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Sat Apr 6 21:48:20 2024 +0000

    Add supported compilers and tools in README.md.

commit 92fe366da54c81a812e43c33ab493a709d4d63df
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Sat Apr 6 20:45:30 2024 +0000

    Fix issues discovered by header tests.

commit f7f6c921437a41278931cd6ab59028c8802658ef
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Sat Apr 6 20:45:06 2024 +0000

    Setup header tests, add C++20 header tests + examples.

    The core library will always be built with C++17, but
    we test our headers / examples under 17 and 20.

commit 4b24f26b661e32b4d5ad0ff569d3dc4b0c1c58ec
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Sat Apr 6 16:21:42 2024 +0000

    Pass CUDA FLAGS to install tests.

commit 4fb672ae9115b19c3720253508f2744e8bc250a9
Author: Allison Piper <alliepiper16@gmail.com>
Date:   Sat Apr 6 15:43:41 2024 +0000

    Add newer GCC (13) and Clang (17, 18).
---
 .../cuda12.4-llvm17/devcontainer.json         | 46 ++++++++++
 .../cuda12.4-llvm18/devcontainer.json         | 46 ++++++++++
 .../actions/compute-matrix/compute-matrix.sh  | 10 +--
 .github/workflows/dispatch-build-and-test.yml | 12 +--
 CMakeLists.txt                                | 15 +++-
 CMakePresets.json                             | 30 +++----
 README.md                                     | 19 ++--
 ci/build_nvbench.sh                           |  2 +-
 ci/matrix.yaml                                | 73 ++++++++-------
 ci/test_nvbench.sh                            |  2 +-
 ci/windows/build_nvbench.ps1                  |  9 +-
 ci/windows/test_nvbench.ps1                   |  9 +-
 cmake/DetectSupportedStandards.cmake          | 65 ++++++++++++++
 cmake/NVBenchHeaderTesting.cmake              | 40 +++++++++
 cmake/header_test.in.cxx                      | 57 ++++++++++++
 examples/CMakeLists.txt                       | 89 +++++++++++--------
 nvbench/axis_base.cuh                         |  1 +
 nvbench/detail/type_list_impl.cuh             |  4 +-
 nvbench/test_kernels.cuh                      |  2 +
 testing/cmake/CMakeLists.txt                  |  1 +
 20 files changed, 405 insertions(+), 127 deletions(-)
 create mode 100644 .devcontainer/cuda12.4-llvm17/devcontainer.json
 create mode 100644 .devcontainer/cuda12.4-llvm18/devcontainer.json
 create mode 100644 cmake/DetectSupportedStandards.cmake
 create mode 100644 cmake/NVBenchHeaderTesting.cmake
 create mode 100644 cmake/header_test.in.cxx

diff --git a/.devcontainer/cuda12.4-llvm17/devcontainer.json b/.devcontainer/cuda12.4-llvm17/devcontainer.json
new file mode 100644
index 00000000..7b9f2e54
--- /dev/null
+++ b/.devcontainer/cuda12.4-llvm17/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm17-cuda12.4-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-llvm17",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "17",
+    "CCCL_BUILD_INFIX": "cuda12.4-llvm17"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-llvm17"
+}
diff --git a/.devcontainer/cuda12.4-llvm18/devcontainer.json b/.devcontainer/cuda12.4-llvm18/devcontainer.json
new file mode 100644
index 00000000..ff2c1a78
--- /dev/null
+++ b/.devcontainer/cuda12.4-llvm18/devcontainer.json
@@ -0,0 +1,46 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:24.06-cpp-llvm18-cuda12.4-ubuntu22.04",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "VAULT_HOST": "https://vault.ops.k8s.rapids.ai",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.4-llvm18",
+    "CCCL_CUDA_VERSION": "12.4",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "18",
+    "CCCL_BUILD_INFIX": "cuda12.4-llvm18"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "clang-format.executable": "/usr/local/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ]
+      }
+    }
+  },
+  "name": "cuda12.4-llvm18"
+}
diff --git a/.github/actions/compute-matrix/compute-matrix.sh b/.github/actions/compute-matrix/compute-matrix.sh
index 8a6d635c..cd3946f1 100755
--- a/.github/actions/compute-matrix/compute-matrix.sh
+++ b/.github/actions/compute-matrix/compute-matrix.sh
@@ -8,21 +8,13 @@ write_output() {
   echo "$key=$value" | tee --append "${GITHUB_OUTPUT:-/dev/null}"
 }
 
-explode_std_versions() {
-  jq -cr 'map(. as $o | {std: $o.std[]} + del($o.std))'
-}
-
-explode_libs() {
-  jq -cr 'map(. as $o | {lib: $o.lib[]} + del($o.lib))'
-}
-
 extract_matrix() {
   local file="$1"
   local type="$2"
   local matrix=$(yq -o=json "$file" | jq -cr ".$type")
   write_output "DEVCONTAINER_VERSION" "$(yq -o json "$file" | jq -cr '.devcontainer_version')"
 
-  local nvcc_full_matrix="$(echo "$matrix" | jq -cr '.nvcc' | explode_std_versions )"
+  local nvcc_full_matrix="$(echo "$matrix" | jq -cr '.nvcc')"
   local per_cuda_compiler_matrix="$(echo "$nvcc_full_matrix" | jq -cr ' group_by(.cuda + .compiler.name) | map({(.[0].cuda + "-" + .[0].compiler.name): .}) | add')"
   write_output "PER_CUDA_COMPILER_MATRIX"  "$per_cuda_compiler_matrix"
   write_output "PER_CUDA_COMPILER_KEYS" "$(echo "$per_cuda_compiler_matrix" | jq -r 'keys | @json')"
diff --git a/.github/workflows/dispatch-build-and-test.yml b/.github/workflows/dispatch-build-and-test.yml
index ce54c673..72cfb6bf 100644
--- a/.github/workflows/dispatch-build-and-test.yml
+++ b/.github/workflows/dispatch-build-and-test.yml
@@ -28,9 +28,9 @@ jobs:
         include: ${{ fromJSON(inputs.per_cuda_compiler_matrix) }}
     with:
       cpu: ${{ matrix.cpu }}
-      test_name: ${{matrix.cpu}}/${{matrix.compiler.name}}${{matrix.compiler.version}}/C++${{matrix.std}} ${{matrix.extra_build_args}}
-      build_script: "./ci/build_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} -std ${{matrix.std}} ${{matrix.extra_build_args}}"
-      test_script:  "./ci/test_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} -std ${{matrix.std}} ${{matrix.extra_build_args}}"
+      test_name: ${{matrix.cpu}}/${{matrix.compiler.name}}${{matrix.compiler.version}} ${{matrix.extra_build_args}}
+      build_script: "./ci/build_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} ${{matrix.extra_build_args}}"
+      test_script:  "./ci/test_${{ inputs.project_name }}.sh -cxx ${{matrix.compiler.exe}} ${{matrix.extra_build_args}}"
       container_image: rapidsai/devcontainers:${{inputs.devcontainer_version}}-cpp-${{matrix.compiler.name}}${{matrix.compiler.version}}-cuda${{matrix.cuda}}-${{matrix.os}}
 
   build_and_test_windows:
@@ -45,7 +45,7 @@ jobs:
       matrix:
         include: ${{ fromJSON(inputs.per_cuda_compiler_matrix) }}
     with:
-      test_name: ${{matrix.cpu}}/${{matrix.compiler.name}}${{matrix.compiler.version}}/C++${{matrix.std}}
-      build_script: "./ci/windows/build_${{ inputs.project_name }}.ps1 -std ${{matrix.std}} ${{matrix.extra_build_args}}"
-      test_script:  "./ci/windows/test_${{ inputs.project_name }}.ps1 -std ${{matrix.std}} ${{matrix.extra_build_args}}"
+      test_name: ${{matrix.cpu}}/${{matrix.compiler.name}}${{matrix.compiler.version}}
+      build_script: "./ci/windows/build_${{ inputs.project_name }}.ps1 ${{matrix.extra_build_args}}"
+      test_script:  "./ci/windows/test_${{ inputs.project_name }}.ps1 ${{matrix.extra_build_args}}"
       container_image: rapidsai/devcontainers:${{inputs.devcontainer_version}}-cuda${{matrix.cuda}}-${{matrix.compiler.name}}${{matrix.compiler.version}}-${{matrix.os}}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b052350f..8eb5f883 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,6 +21,11 @@ project(NVBench
 
 nvbench_init_rapids_cmake()
 
+# Define NVBench_DETECTED_${LANG}_STANDARDS
+include(cmake/DetectSupportedStandards.cmake)
+detect_supported_standards(NVBench CXX 17 20)
+detect_supported_standards(NVBench CUDA 17 20)
+
 # See NVIDIA/NVBench#52
 find_package(CUDAToolkit REQUIRED)
 set(cupti_default ON)
@@ -34,6 +39,7 @@ option(NVBench_ENABLE_NVML "Build with NVML support from the Cuda Toolkit." ON)
 option(NVBench_ENABLE_CUPTI "Build NVBench with CUPTI." ${cupti_default})
 
 option(NVBench_ENABLE_TESTING "Build NVBench testing suite." OFF)
+option(NVBench_ENABLE_HEADER_TESTING "Build NVBench header tests (compile each public header in isolation)." OFF)
 option(NVBench_ENABLE_DEVICE_TESTING
   "Include tests that require a GPU (with locked clocks)."
   OFF
@@ -55,7 +61,10 @@ message(STATUS "NVBench CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 
 add_subdirectory(nvbench)
 
-if (NVBench_ENABLE_EXAMPLES OR NVBench_ENABLE_TESTING)
+if (NVBench_ENABLE_EXAMPLES OR
+    NVBench_ENABLE_TESTING OR
+    NVBench_ENABLE_HEADER_TESTING)
+  include(CTest)
   enable_testing()
 endif()
 
@@ -69,4 +78,8 @@ if (NVBench_ENABLE_TESTING)
   add_subdirectory(testing)
 endif()
 
+if (NVBench_ENABLE_HEADER_TESTING)
+  include(cmake/NVBenchHeaderTesting.cmake)
+endif()
+
 nvbench_generate_exports()
diff --git a/CMakePresets.json b/CMakePresets.json
index 42e24428..3e66f9ad 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -17,6 +17,7 @@
         "NVBench_ENABLE_CUPTI": true,
         "NVBench_ENABLE_DEVICE_TESTING": false,
         "NVBench_ENABLE_EXAMPLES": true,
+        "NVBench_ENABLE_HEADER_TESTING": true,
         "NVBench_ENABLE_INSTALL_RULES": true,
         "NVBench_ENABLE_NVML": true,
         "NVBench_ENABLE_TESTING": true,
@@ -24,30 +25,27 @@
       }
     },
     {
-      "name": "all-dev",
+      "name": "nvbench-dev",
+      "displayName": "Developer Build",
       "inherits": "base",
       "cacheVariables": {
         "NVBench_ENABLE_DEVICE_TESTING": true
       }
     },
     {
-      "name": "nvbench-cpp17",
-      "displayName": "nvbench_c++17",
-      "inherits": "base",
-      "cacheVariables": {
-        "CMAKE_CXX_STANDARD": "17",
-        "CMAKE_CUDA_STANDARD": "17"
-      }
+      "name": "nvbench-ci",
+      "displayName": "NVBench CI",
+      "inherits": "base"
     }
   ],
   "buildPresets": [
     {
-      "name": "all-dev",
-      "configurePreset": "all-dev"
+      "name": "nvbench-dev",
+      "configurePreset": "nvbench-dev"
     },
     {
-      "name": "nvbench-cpp17",
-      "configurePreset": "nvbench-cpp17"
+      "name": "nvbench-ci",
+      "configurePreset": "nvbench-ci"
     }
   ],
   "testPresets": [
@@ -63,13 +61,13 @@
       }
     },
     {
-      "name": "all-dev",
-      "configurePreset": "all-dev",
+      "name": "nvbench-dev",
+      "configurePreset": "nvbench-dev",
       "inherits": "base"
     },
     {
-      "name": "nvbench-cpp17",
-      "configurePreset": "nvbench-cpp17",
+      "name": "nvbench-ci",
+      "configurePreset": "nvbench-ci",
       "inherits": "base"
     }
   ]
diff --git a/README.md b/README.md
index c1cad5ad..285213f1 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,15 @@ features:
     * Executes the benchmark multiple times back-to-back and records total time.
     * Reports the average execution time (total time / number of executions).
 
+# Supported Compilers and Tools
+
+- CMake > 3.23.1
+- CUDA Toolkit + nvcc: 11.1 -> 12.4
+- g++: 7 -> 12
+- clang++: 9 -> 18
+- cl.exe: 2019 -> 2022 (19.29, 19.39)
+- Headers are tested with C++17 -> C++20.
+
 # Getting Started
 
 ## Minimal Benchmark
@@ -34,7 +43,7 @@ A basic kernel benchmark can be created with just a few lines of CUDA C++:
 
 ```cpp
 void my_benchmark(nvbench::state& state) {
-  state.exec([](nvbench::launch& launch) { 
+  state.exec([](nvbench::launch& launch) {
     my_kernel<<<num_blocks, 256, 0, launch.get_stream()>>>();
   });
 }
@@ -72,7 +81,7 @@ mkdir -p build
 cd build
 cmake -DNVBench_ENABLE_EXAMPLES=ON -DCMAKE_CUDA_ARCHITECTURES=70 .. && make
 ```
-Be sure to set `CMAKE_CUDA_ARCHITECTURE` based on the GPU you are running on. 
+Be sure to set `CMAKE_CUDA_ARCHITECTURES` based on the GPU you are running on.
 
 Examples are built by default into `build/bin` and are prefixed with `nvbench.example`.
 
@@ -119,7 +128,7 @@ Pass: Batch: 0.261963ms GPU, 7.18s total GPU, 27394x
 ## Demo Project
 
 To get started using NVBench with your own kernels, consider trying out
-the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo). 
+the [NVBench Demo Project](https://github.com/allisonvacanti/nvbench_demo).
 
 `nvbench_demo` provides a simple CMake project that uses NVBench to build an
 example benchmark. It's a great way to experiment with the library without a lot
@@ -129,7 +138,7 @@ of investment.
 
 Contributions are welcome!
 
-For current issues, see the [issue board](https://github.com/NVIDIA/nvbench/issues). Issues labeled with [![](https://img.shields.io/github/labels/NVIDIA/nvbench/good%20first%20issue)](https://github.com/NVIDIA/nvbench/labels/good%20first%20issue) are good for first time contributors. 
+For current issues, see the [issue board](https://github.com/NVIDIA/nvbench/issues). Issues labeled with [![](https://img.shields.io/github/labels/NVIDIA/nvbench/good%20first%20issue)](https://github.com/NVIDIA/nvbench/labels/good%20first%20issue) are good for first time contributors.
 
 ## Tests
 
@@ -146,7 +155,7 @@ To run all tests:
 ```
 make test
 ```
-or 
+or
 ```
 ctest
 ```
diff --git a/ci/build_nvbench.sh b/ci/build_nvbench.sh
index ecd06289..cc245d3a 100755
--- a/ci/build_nvbench.sh
+++ b/ci/build_nvbench.sh
@@ -4,7 +4,7 @@ source "$(dirname "$0")/build_common.sh"
 
 print_environment_details
 
-PRESET="nvbench-cpp$CXX_STANDARD"
+PRESET="nvbench-ci"
 
 CMAKE_OPTIONS=""
 
diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index a1bfb570..99594730 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -14,6 +14,7 @@ gcc9: &gcc9 { name: 'gcc', version: '9', exe: 'g++' }
 gcc10: &gcc10 { name: 'gcc', version: '10', exe: 'g++' }
 gcc11: &gcc11 { name: 'gcc', version: '11', exe: 'g++' }
 gcc12: &gcc12 { name: 'gcc', version: '12', exe: 'g++' }
+gcc13: &gcc13 { name: 'gcc', version: '13', exe: 'g++' }
 
 # LLVM Compiler configurations
 llvm9: &llvm9 { name: 'llvm', version: '9', exe: 'clang++' }
@@ -24,6 +25,8 @@ llvm13: &llvm13 { name: 'llvm', version: '13', exe: 'clang++' }
 llvm14: &llvm14 { name: 'llvm', version: '14', exe: 'clang++' }
 llvm15: &llvm15 { name: 'llvm', version: '15', exe: 'clang++' }
 llvm16: &llvm16 { name: 'llvm', version: '16', exe: 'clang++' }
+llvm17: &llvm17 { name: 'llvm', version: '17', exe: 'clang++' }
+llvm18: &llvm18 { name: 'llvm', version: '18', exe: 'clang++' }
 
 # MSVC configs
 msvc2019: &msvc2019 { name: 'cl', version: '14.29', exe: 'cl++' }
@@ -44,36 +47,40 @@ msvc2022: &msvc2022 { name: 'cl', version: '14.39', exe: 'cl++' }
 # Configurations that will run for every PR
 pull_request:
   nvcc:
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc7,     std: [17], extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc8,     std: [17], extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc9,     std: [17], extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
-    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm9,    std: [17], extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
-    - {cuda: *cuda_prev_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11,    std: [17]}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7,     std: [17]}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8,     std: [17]}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9,     std: [17]}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10,    std: [17]}
-    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11,    std: [17]}
-    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12,    std: [17]}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9,    std: [17]}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10,   std: [17]}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11,   std: [17]}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12,   std: [17]}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13,   std: [17]}
-    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14,   std: [17]}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7,     std: [17]}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8,     std: [17]}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9,     std: [17]}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10,    std: [17]}
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11,    std: [17]}
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12,    std: [17]}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9,    std: [17]}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10,   std: [17]}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11,   std: [17]}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12,   std: [17]}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13,   std: [17]}
-    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14,   std: [17]}
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15,   std: [17]}
-    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16,   std: [17]}
-    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2019, std: [17], extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF -DNVBench_ENABLE_NVML=OFF'"}
-    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2022, std: [17], extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF -DNVBench_ENABLE_NVML=OFF'"}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc7,     extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc8,     extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *gcc9,     extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
+    - {cuda: *cuda_prev_min, os: 'ubuntu18.04', cpu: 'amd64', compiler: *llvm9,    extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF'"}
+    - {cuda: *cuda_prev_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10}
+    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11}
+    - {cuda: *cuda_curr_min, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13}
+    - {cuda: *cuda_curr_min, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc7}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc8}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc9}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *gcc10}
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc11}
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc12}
+    # Fails to compile simple input on CTK12.4. Try to add later.
+    # {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *gcc13}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm9}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm10}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm11}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm12}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm13}
+    - {cuda: *cuda_curr_max, os: 'ubuntu20.04', cpu: 'amd64', compiler: *llvm14}
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm15}
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm16}
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm17}
+    - {cuda: *cuda_curr_max, os: 'ubuntu22.04', cpu: 'amd64', compiler: *llvm18,   extra_build_args: "-cmake-options '-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler'"}
+    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2019, extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF -DNVBench_ENABLE_NVML=OFF'"}
+    - {cuda: *cuda_curr_max, os: 'windows2022', cpu: 'amd64', compiler: *msvc2022, extra_build_args: "-cmake-options '-DNVBench_ENABLE_CUPTI=OFF -DNVBench_ENABLE_NVML=OFF'"}
diff --git a/ci/test_nvbench.sh b/ci/test_nvbench.sh
index f89c6fe6..40559eda 100755
--- a/ci/test_nvbench.sh
+++ b/ci/test_nvbench.sh
@@ -11,7 +11,7 @@ print_environment_details
 
 ./build_nvbench.sh "$@"
 
-PRESET="nvbench-cpp$CXX_STANDARD"
+PRESET="nvbench-ci"
 
 test_preset "NVBench" ${PRESET}
 
diff --git a/ci/windows/build_nvbench.ps1 b/ci/windows/build_nvbench.ps1
index e2a90a25..1ac8bd16 100644
--- a/ci/windows/build_nvbench.ps1
+++ b/ci/windows/build_nvbench.ps1
@@ -1,10 +1,5 @@
 
 Param(
-    [Parameter(Mandatory = $true)]
-    [Alias("std")]
-    [ValidateNotNullOrEmpty()]
-    [ValidateSet(17)]
-    [int]$CXX_STANDARD = 17,
     [Parameter(Mandatory = $false)]
     [Alias("cmake-options")]
     [ValidateNotNullOrEmpty()]
@@ -18,9 +13,9 @@ If($CURRENT_PATH -ne "ci") {
 }
 
 Remove-Module -Name build_common
-Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD
+Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList 17
 
-$PRESET = "nvbench-cpp$CXX_STANDARD"
+$PRESET = "nvbench-ci"
 $CMAKE_OPTIONS = ""
 
 # Append any arguments pass in on the command line
diff --git a/ci/windows/test_nvbench.ps1 b/ci/windows/test_nvbench.ps1
index 57ccd8e8..bcd9f2c9 100644
--- a/ci/windows/test_nvbench.ps1
+++ b/ci/windows/test_nvbench.ps1
@@ -1,10 +1,5 @@
 
 Param(
-    [Parameter(Mandatory = $true)]
-    [Alias("std")]
-    [ValidateNotNullOrEmpty()]
-    [ValidateSet(17)]
-    [int]$CXX_STANDARD = 17,
     [Parameter(Mandatory = $false)]
     [Alias("cmake-options")]
     [ValidateNotNullOrEmpty()]
@@ -18,9 +13,9 @@ If($CURRENT_PATH -ne "ci") {
 }
 
 Remove-Module -Name build_common
-Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList $CXX_STANDARD
+Import-Module $PSScriptRoot/build_common.psm1 -ArgumentList 17
 
-$PRESET = "nvbench-cpp$CXX_STANDARD"
+$PRESET = "nvbench-ci"
 $CMAKE_OPTIONS = ""
 
 # Append any arguments pass in on the command line
diff --git a/cmake/DetectSupportedStandards.cmake b/cmake/DetectSupportedStandards.cmake
new file mode 100644
index 00000000..6a86d6ac
--- /dev/null
+++ b/cmake/DetectSupportedStandards.cmake
@@ -0,0 +1,65 @@
+# Detect the language standards supported by the current compilers.
+#
+# Usage: detect_supported_standards(<var_prefix> <lang> <standards>)
+#
+# - var_prefix: Used to name result variables,
+#   e.g. ${var_prefix}_${lang}_XX_SUPPORTED will be TRUE or FALSE. Defined for
+#   each XX in ${standards}.
+# - lang: The language to test: C, CXX, or CUDA.
+# - standards: List of any standard versions.
+#
+# Example: detect_supported_standards(PROJ CXX 11 14 17)
+#   - Sets the following variables in the parent scope to TRUE or FALSE:
+#     - PROJ_CXX_11_SUPPORTED
+#     - PROJ_CXX_14_SUPPORTED
+#     - PROJ_CXX_17_SUPPORTED
+#   - Sets `PROJ_DETECTED_CXX_STANDARDS` to a list of supported standards (e.g. "11;14;17").
+function(detect_supported_standards prefix lang)
+  string(TOLOWER "${lang}_std" feature_prefix)
+  set(all_stds)
+  foreach(standard IN LISTS ARGN)
+    set(var_name "${prefix}_${lang}_${standard}_SUPPORTED")
+    if ("${feature_prefix}_${standard}" IN_LIST CMAKE_${lang}_COMPILE_FEATURES)
+      set(${var_name} TRUE)
+    else()
+      set(${var_name} FALSE)
+    endif()
+
+    # Special cases (keyed on the host CXX compiler, for both CXX and CUDA):
+    if (standard EQUAL 17 AND
+        (lang STREQUAL "CXX" OR lang STREQUAL "CUDA") AND
+        ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7) OR
+         (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND
+          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)))
+      # gcc < 7 and clang < 8 don't fully support C++17.
+      # They accept the flag and have partial support, but nvcc will refuse
+      # to enable it and falls back to the default dialect for the current
+      # CXX compiler version. This breaks our CI.
+      # CMake's COMPILE_FEATURES var reports that these compilers support C++17,
+      # but we can't rely on it, so manually disable the dialect in these cases.
+      set(${var_name} FALSE)
+    endif()
+
+    if (standard EQUAL 20 AND
+        (lang STREQUAL "CXX" OR lang STREQUAL "CUDA") AND
+        ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10) OR
+         (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND
+          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10) OR
+         (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" AND
+          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.30)))
+      # Similar to the above, but for C++20. cl.exe 19.30 is the first VS2022 compiler (not "1930").
+      set(${var_name} FALSE)
+    endif()
+
+    if (${var_name})
+      list(APPEND all_stds ${standard})
+    endif()
+
+    message(STATUS "Testing ${lang}${standard} Support: ${${var_name}}")
+    set(${var_name} ${${var_name}} PARENT_SCOPE)
+  endforeach()
+
+  set(${prefix}_DETECTED_${lang}_STANDARDS "${all_stds}" PARENT_SCOPE)
+endfunction()
diff --git a/cmake/NVBenchHeaderTesting.cmake b/cmake/NVBenchHeaderTesting.cmake
new file mode 100644
index 00000000..354ec84d
--- /dev/null
+++ b/cmake/NVBenchHeaderTesting.cmake
@@ -0,0 +1,40 @@
+# For every public header, build a translation unit containing `#include <header>`
+# with some various checks.
+
+set(excluded_headers_regexes
+  # Should never be used externally.
+  "^detail"
+  "^internal"
+)
+
+# Meta target for all configs' header builds:
+add_custom_target(nvbench.headers.all)
+add_dependencies(nvbench.all nvbench.headers.all)
+
+file(GLOB_RECURSE header_files
+  RELATIVE "${NVBench_SOURCE_DIR}/nvbench/"
+  CONFIGURE_DEPENDS
+  "${NVBench_SOURCE_DIR}/nvbench/*.cuh"
+)
+
+foreach (exclusion IN LISTS excluded_headers_regexes)
+  list(FILTER header_files EXCLUDE REGEX "${exclusion}")
+endforeach()
+
+function (nvbench_add_header_target target_name cuda_std) # Adds one OBJECT library of per-header test TUs for the given CUDA standard.
+  foreach (header IN LISTS header_files) # header_files is not an argument; it is read from the including scope.
+    set(headertest_src "headers/${target_name}/${header}.cu")
+    set(header_str "nvbench/${header}") # Substituted for ${header_str} in the template by configure_file:
+    configure_file("${NVBench_SOURCE_DIR}/cmake/header_test.in.cxx" "${headertest_src}")
+    list(APPEND headertest_srcs "${headertest_src}")
+  endforeach()
+
+  add_library(${target_name} OBJECT ${headertest_srcs})
+  target_link_libraries(${target_name} PUBLIC nvbench::nvbench)
+  set_target_properties(${target_name} PROPERTIES COMPILE_FEATURES cuda_std_${cuda_std}) # Require the cuda_std_<cuda_std> compile feature.
+  add_dependencies(nvbench.headers.all ${target_name}) # Hook into the nvbench.headers.all meta target.
+endfunction()
+
+foreach (std IN LISTS NVBench_DETECTED_CUDA_STANDARDS)
+  nvbench_add_header_target(nvbench.headers.cpp${std} ${std})
+endforeach()
diff --git a/cmake/header_test.in.cxx b/cmake/header_test.in.cxx
new file mode 100644
index 00000000..c26753e1
--- /dev/null
+++ b/cmake/header_test.in.cxx
@@ -0,0 +1,57 @@
+// This source file checks that:
+// 1) Header <${header_str}> compiles without error.
+// 2) Common macro collisions with platform/system headers are avoided.
+
+// Turn off failures for certain configurations:
+#ifndef NVBench_IGNORE_MACRO_CHECKS
+
+// Define NVBench_MACRO_CHECK(macro, header), which emits a diagnostic indicating
+// a potential macro collision and halts.
+//
+// Hacky way to build a string, but it works on all tested platforms.
+#define NVBench_MACRO_CHECK(MACRO, HEADER)                                      \
+  NVBench_MACRO_CHECK_IMPL(Identifier MACRO should not be used from NVBench      \
+                           headers due to conflicts with HEADER macros.)
+
+// Use raw platform checks instead of the NVBench_HOST_COMPILER macros since we
+// don't want to #include any headers other than the one being tested.
+//
+// This is only implemented for MSVC/GCC/Clang.
+#if defined(_MSC_VER) // MSVC
+
+// Fake up an error for MSVC
+#define NVBench_MACRO_CHECK_IMPL(msg)                                           \
+  /* Print message that looks like an error: */                                \
+  __pragma(message(__FILE__ ":" NVBench_MACRO_CHECK_IMPL0(__LINE__)             \
+                   ": error: " #msg))                                          \
+  /* abort compilation due to static_assert or syntax error: */                \
+  static_assert(false, #msg);
+#define NVBench_MACRO_CHECK_IMPL0(x) NVBench_MACRO_CHECK_IMPL1(x)
+#define NVBench_MACRO_CHECK_IMPL1(x) #x
+
+#elif defined(__clang__) || defined(__GNUC__)
+
+// GCC/clang are easy: expand to the equivalent of `#pragma GCC error "<msg>"`.
+#define NVBench_MACRO_CHECK_IMPL(msg) NVBench_MACRO_CHECK_IMPL0(GCC error #msg)
+#define NVBench_MACRO_CHECK_IMPL0(expr) _Pragma(#expr)
+
+#endif
+
+// complex.h conflicts
+#define I NVBench_MACRO_CHECK('I', complex.h)
+
+// windows.h conflicts
+#define small NVBench_MACRO_CHECK('small', windows.h)
+// We can't enable these checks without breaking some builds -- some standard
+// library implementations unconditionally `#undef` these macros, which then
+// causes random failures later.
+// Leaving these commented out as a warning: Here be dragons.
+//#define min(...) NVBench_MACRO_CHECK('min', windows.h)
+//#define max(...) NVBench_MACRO_CHECK('max', windows.h)
+
+// termios.h conflicts (NVIDIA/thrust#1547)
+#define B0 NVBench_MACRO_CHECK("B0", termios.h)
+
+#endif // NVBench_IGNORE_MACRO_CHECKS
+
+#include <${header_str}>
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index b0f288c4..a98bcbeb 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -14,47 +14,58 @@ set(example_srcs
 add_custom_target(nvbench.example.all)
 add_dependencies(nvbench.all nvbench.example.all)
 
-foreach(example_src IN LISTS example_srcs)
-  get_filename_component(example_name "${example_src}" NAME_WLE)
-  string(PREPEND example_name "nvbench.example.")
-  add_executable(${example_name} "${example_src}")
-  nvbench_config_target(${example_name})
-  target_include_directories(${example_name} PRIVATE "${CMAKE_CURRENT_LIST_DIR}")
-  target_link_libraries(${example_name} PRIVATE nvbench::main)
-  set_target_properties(${example_name} PROPERTIES COMPILE_FEATURES cuda_std_17)
-  add_test(NAME ${example_name}
-    COMMAND "$<TARGET_FILE:${example_name}>" --timeout 0.1 --min-time 1e-5
-  )
 
-  # These should not deadlock. If they do, it may be that the CUDA context was created before
-  # setting CUDA_MODULE_LOAD=EAGER in main, see NVIDIA/nvbench#136.
-  set_tests_properties(${example_name} PROPERTIES
-    FAIL_REGULAR_EXPRESSION "Possible Deadlock Detected"
-  )
+function (nvbench_add_examples_target target_prefix cuda_std)
+  add_custom_target(${target_prefix}.all)
+  add_dependencies(nvbench.example.all ${target_prefix}.all)
 
-  add_dependencies(nvbench.example.all ${example_name})
-endforeach()
+  foreach(example_src IN LISTS example_srcs)
+    get_filename_component(example_name "${example_src}" NAME_WLE)
+    string(PREPEND example_name "${target_prefix}.")
+    add_executable(${example_name} "${example_src}")
+    nvbench_config_target(${example_name})
+    target_include_directories(${example_name} PRIVATE "${CMAKE_CURRENT_LIST_DIR}")
+    target_link_libraries(${example_name} PRIVATE nvbench::main)
+    set_target_properties(${example_name} PROPERTIES COMPILE_FEATURES cuda_std_${cuda_std})
+    add_test(NAME ${example_name}
+      COMMAND "$<TARGET_FILE:${example_name}>" --timeout 0.1 --min-time 1e-5
+    )
 
-# Silence some warnings from old thrust headers:
-set(thrust_examples
-  auto_throughput
-  axes
-  custom_criterion
-  exec_tag_sync
-  exec_tag_timer
-  skip
-  stream
-  throughput
-)
-foreach (example IN LISTS thrust_examples)
-  if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-    # C4324: structure was padded due to alignment specifier
-    nvbench_add_cxx_flag(nvbench.example.${example} PRIVATE "/wd4324")
-
-    # warning C4201: nonstandard extension used: nameless struct/union:
-    # Fixed in Thrust 1.12.0 (CTK 11.4, NV HPC 21.3)
-    if (${CUDAToolkit_VERSION} VERSION_LESS 11.4)
-      nvbench_add_cxx_flag(nvbench.example.${example} PRIVATE "/wd4201")
+    # These should not deadlock. If they do, it may be that the CUDA context was created before
+    # setting CUDA_MODULE_LOAD=EAGER in main, see NVIDIA/nvbench#136.
+    set_tests_properties(${example_name} PROPERTIES
+      FAIL_REGULAR_EXPRESSION "Possible Deadlock Detected"
+    )
+
+    add_dependencies(${target_prefix}.all ${example_name})
+  endforeach()
+
+  # Silence some warnings from old thrust headers:
+  set(thrust_examples
+    auto_throughput
+    axes
+    custom_criterion
+    exec_tag_sync
+    exec_tag_timer
+    skip
+    stream
+    throughput
+  )
+  foreach (example IN LISTS thrust_examples)
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+      # C4324: structure was padded due to alignment specifier
+      nvbench_add_cxx_flag(${target_prefix}.${example} PRIVATE "/wd4324")
+
+      # warning C4201: nonstandard extension used: nameless struct/union:
+      # Fixed in Thrust 1.12.0 (CTK 11.4, NV HPC 21.3)
+      if (${CUDAToolkit_VERSION} VERSION_LESS 11.4)
+        nvbench_add_cxx_flag(${target_prefix}.${example} PRIVATE "/wd4201")
+      endif()
     endif()
-  endif()
+  endforeach()
+endfunction()
+
+
+foreach (std IN LISTS NVBench_DETECTED_CUDA_STANDARDS)
+  nvbench_add_examples_target(nvbench.example.cpp${std} ${std})
 endforeach()
diff --git a/nvbench/axis_base.cuh b/nvbench/axis_base.cuh
index 85d92c7d..b3e089ec 100644
--- a/nvbench/axis_base.cuh
+++ b/nvbench/axis_base.cuh
@@ -21,6 +21,7 @@
 #include <memory>
 #include <stdexcept>
 #include <string>
+#include <string_view>
 #include <utility>
 
 namespace nvbench
diff --git a/nvbench/detail/type_list_impl.cuh b/nvbench/detail/type_list_impl.cuh
index 8a18aa3d..e97aaaa1 100644
--- a/nvbench/detail/type_list_impl.cuh
+++ b/nvbench/detail/type_list_impl.cuh
@@ -22,8 +22,8 @@ namespace tl::detail
 template <typename... Ts>
 auto size(nvbench::type_list<Ts...>) -> std::integral_constant<std::size_t, sizeof...(Ts)>;
 
-template <std::size_t I, typename... Ts>
-auto get(nvbench::type_list<Ts...>) -> std::tuple_element_t<I, std::tuple<Ts...>>;
+template <std::size_t Idx, typename... Ts>
+auto get(nvbench::type_list<Ts...>) -> std::tuple_element_t<Idx, std::tuple<Ts...>>;
 
 template <typename... Ts, typename... Us>
 auto concat(nvbench::type_list<Ts...>, nvbench::type_list<Us...>)
diff --git a/nvbench/test_kernels.cuh b/nvbench/test_kernels.cuh
index f01305c8..f46216dc 100644
--- a/nvbench/test_kernels.cuh
+++ b/nvbench/test_kernels.cuh
@@ -18,6 +18,8 @@
 
 #pragma once
 
+#include <nvbench/types.cuh>
+
 #include <cuda/std/chrono>
 
 #include <cuda_runtime.h>
diff --git a/testing/cmake/CMakeLists.txt b/testing/cmake/CMakeLists.txt
index 2cb2f5fa..6932c00c 100644
--- a/testing/cmake/CMakeLists.txt
+++ b/testing/cmake/CMakeLists.txt
@@ -12,6 +12,7 @@ set(cmake_opts
   -D "CMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}"
   -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
   -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}"
+  -D "CMAKE_CUDA_FLAGS=${CMAKE_CUDA_FLAGS}"
   -D "CMAKE_CUDA_ARCHITECTURES=${arches}"
 )