diff --git a/.devcontainer/cuda12.6-llvm14/devcontainer.json b/.devcontainer/cuda12.6-llvm14/devcontainer.json
deleted file mode 100644
index 16d79694534..00000000000
--- a/.devcontainer/cuda12.6-llvm14/devcontainer.json
+++ /dev/null
@@ -1,54 +0,0 @@
-{
-  "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.02-cpp-llvm14-cuda12.6",
-  "hostRequirements": {
-    "gpu": "optional"
-  },
-  "initializeCommand": [
-    "/bin/bash",
-    "-c",
-    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;",
-    "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;"
-  ],
-  "containerEnv": {
-    "SCCACHE_REGION": "us-east-2",
-    "SCCACHE_BUCKET": "rapids-sccache-devs",
-    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
-    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.6-llvm14",
-    "CCCL_CUDA_VERSION": "12.6",
-    "CCCL_HOST_COMPILER": "llvm",
-    "CCCL_HOST_COMPILER_VERSION": "14",
-    "CCCL_BUILD_INFIX": "cuda12.6-llvm14",
-    "CCCL_CUDA_EXTENDED": "false"
-  },
-  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
-  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
-  "mounts": [
-    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
-    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
-    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
-    "source=cccl-build,target=/home/coder/cccl/build"
-  ],
-  "customizations": {
-    "vscode": {
-      "extensions": [
-        "llvm-vs-code-extensions.vscode-clangd",
-        "xaver.clang-format",
-        "nvidia.nsight-vscode-edition",
-        "ms-vscode.cmake-tools"
-      ],
-      "settings": {
-        "editor.defaultFormatter": "xaver.clang-format",
-        "editor.formatOnSave": true,
-        "clang-format.executable": "/usr/bin/clang-format",
-        "clangd.arguments": [
-          "--compile-commands-dir=${workspaceFolder}"
-        ],
-        "files.eol": "\n",
-        "files.trimTrailingWhitespace": true
-      }
-    }
-  },
-  "name": "cuda12.6-llvm14"
-}
diff --git a/.devcontainer/cuda12.6-gcc10/devcontainer.json b/.devcontainer/cuda12.8-gcc10/devcontainer.json
similarity index 90%
rename from .devcontainer/cuda12.6-gcc10/devcontainer.json
rename to .devcontainer/cuda12.8-gcc10/devcontainer.json
index 8ecf9dfa038..5fa59ac052f 100644
--- a/.devcontainer/cuda12.6-gcc10/devcontainer.json
+++ b/.devcontainer/cuda12.8-gcc10/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.02-cpp-gcc10-cuda12.6",
+  "image": "rapidsai/devcontainers:25.02-cpp-gcc10-cuda12.8",
   "hostRequirements": {
     "gpu": "optional"
   },
@@ -15,11 +15,11 @@
     "SCCACHE_BUCKET": "rapids-sccache-devs",
     "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
     "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.6-gcc10",
-    "CCCL_CUDA_VERSION": "12.6",
+    "DEVCONTAINER_NAME": "cuda12.8-gcc10",
+    "CCCL_CUDA_VERSION": "12.8",
     "CCCL_HOST_COMPILER": "gcc",
     "CCCL_HOST_COMPILER_VERSION": "10",
-    "CCCL_BUILD_INFIX": "cuda12.6-gcc10",
+    "CCCL_BUILD_INFIX": "cuda12.8-gcc10",
     "CCCL_CUDA_EXTENDED": "false"
   },
   "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
@@ -50,5 +50,5 @@
       }
     }
   },
-  "name": "cuda12.6-gcc10"
+  "name": "cuda12.8-gcc10"
 }
diff --git a/.devcontainer/cuda12.6-gcc11/devcontainer.json b/.devcontainer/cuda12.8-gcc11/devcontainer.json
similarity index 90%
rename from .devcontainer/cuda12.6-gcc11/devcontainer.json
rename to .devcontainer/cuda12.8-gcc11/devcontainer.json
index 2f26a6e4d4f..e6408c727e9 100644
--- a/.devcontainer/cuda12.6-gcc11/devcontainer.json
+++ b/.devcontainer/cuda12.8-gcc11/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.02-cpp-gcc11-cuda12.6",
+  "image": "rapidsai/devcontainers:25.02-cpp-gcc11-cuda12.8",
   "hostRequirements": {
     "gpu": "optional"
   },
@@ -15,11 +15,11 @@
     "SCCACHE_BUCKET": "rapids-sccache-devs",
     "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
     "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.6-gcc11",
-    "CCCL_CUDA_VERSION": "12.6",
+    "DEVCONTAINER_NAME": "cuda12.8-gcc11",
+    "CCCL_CUDA_VERSION": "12.8",
     "CCCL_HOST_COMPILER": "gcc",
     "CCCL_HOST_COMPILER_VERSION": "11",
-    "CCCL_BUILD_INFIX": "cuda12.6-gcc11",
+    "CCCL_BUILD_INFIX": "cuda12.8-gcc11",
     "CCCL_CUDA_EXTENDED": "false"
   },
   "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
@@ -50,5 +50,5 @@
       }
     }
   },
-  "name": "cuda12.6-gcc11"
+  "name": "cuda12.8-gcc11"
 }
diff --git a/.devcontainer/cuda12.6-gcc12/devcontainer.json b/.devcontainer/cuda12.8-gcc12/devcontainer.json
similarity index 90%
rename from .devcontainer/cuda12.6-gcc12/devcontainer.json
rename to .devcontainer/cuda12.8-gcc12/devcontainer.json
index 208dd6f76e1..1538817d17b 100644
--- a/.devcontainer/cuda12.6-gcc12/devcontainer.json
+++ b/.devcontainer/cuda12.8-gcc12/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.02-cpp-gcc12-cuda12.6",
+  "image": "rapidsai/devcontainers:25.02-cpp-gcc12-cuda12.8",
   "hostRequirements": {
     "gpu": "optional"
   },
@@ -15,11 +15,11 @@
     "SCCACHE_BUCKET": "rapids-sccache-devs",
     "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
     "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.6-gcc12",
-    "CCCL_CUDA_VERSION": "12.6",
+    "DEVCONTAINER_NAME": "cuda12.8-gcc12",
+    "CCCL_CUDA_VERSION": "12.8",
     "CCCL_HOST_COMPILER": "gcc",
     "CCCL_HOST_COMPILER_VERSION": "12",
-    "CCCL_BUILD_INFIX": "cuda12.6-gcc12",
+    "CCCL_BUILD_INFIX": "cuda12.8-gcc12",
     "CCCL_CUDA_EXTENDED": "false"
   },
   "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
@@ -50,5 +50,5 @@
       }
     }
   },
-  "name": "cuda12.6-gcc12"
+  "name": "cuda12.8-gcc12"
 }
diff --git a/.devcontainer/cuda12.6-gcc13/devcontainer.json b/.devcontainer/cuda12.8-gcc13/devcontainer.json
similarity index 90%
rename from .devcontainer/cuda12.6-gcc13/devcontainer.json
rename to .devcontainer/cuda12.8-gcc13/devcontainer.json
index 1c7203a1a3c..0cf38c8b80b 100644
--- a/.devcontainer/cuda12.6-gcc13/devcontainer.json
+++ b/.devcontainer/cuda12.8-gcc13/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.02-cpp-gcc13-cuda12.6",
+  "image": "rapidsai/devcontainers:25.02-cpp-gcc13-cuda12.8",
   "hostRequirements": {
     "gpu": "optional"
   },
@@ -15,11 +15,11 @@
     "SCCACHE_BUCKET": "rapids-sccache-devs",
     "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
     "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.6-gcc13",
-    "CCCL_CUDA_VERSION": "12.6",
+    "DEVCONTAINER_NAME": "cuda12.8-gcc13",
+    "CCCL_CUDA_VERSION": "12.8",
     "CCCL_HOST_COMPILER": "gcc",
     "CCCL_HOST_COMPILER_VERSION": "13",
-    "CCCL_BUILD_INFIX": "cuda12.6-gcc13",
+    "CCCL_BUILD_INFIX": "cuda12.8-gcc13",
     "CCCL_CUDA_EXTENDED": "false"
   },
   "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
@@ -50,5 +50,5 @@
       }
     }
   },
-  "name": "cuda12.6-gcc13"
+  "name": "cuda12.8-gcc13"
 }
diff --git a/.devcontainer/cuda12.6-gcc7/devcontainer.json b/.devcontainer/cuda12.8-gcc7/devcontainer.json
similarity index 90%
rename from .devcontainer/cuda12.6-gcc7/devcontainer.json
rename to .devcontainer/cuda12.8-gcc7/devcontainer.json
index 6e9aa206662..5ca28834c3b 100644
--- a/.devcontainer/cuda12.6-gcc7/devcontainer.json
+++ b/.devcontainer/cuda12.8-gcc7/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.02-cpp-gcc7-cuda12.6",
+  "image": "rapidsai/devcontainers:25.02-cpp-gcc7-cuda12.8",
   "hostRequirements": {
     "gpu": "optional"
   },
@@ -15,11 +15,11 @@
     "SCCACHE_BUCKET": "rapids-sccache-devs",
     "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
     "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.6-gcc7",
-    "CCCL_CUDA_VERSION": "12.6",
+    "DEVCONTAINER_NAME": "cuda12.8-gcc7",
+    "CCCL_CUDA_VERSION": "12.8",
     "CCCL_HOST_COMPILER": "gcc",
     "CCCL_HOST_COMPILER_VERSION": "7",
-    "CCCL_BUILD_INFIX": "cuda12.6-gcc7",
+    "CCCL_BUILD_INFIX": "cuda12.8-gcc7",
     "CCCL_CUDA_EXTENDED": "false"
   },
   "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
@@ -50,5 +50,5 @@
       }
     }
   },
-  "name": "cuda12.6-gcc7"
+  "name": "cuda12.8-gcc7"
 }
diff --git a/.devcontainer/cuda12.6-gcc8/devcontainer.json b/.devcontainer/cuda12.8-gcc8/devcontainer.json
similarity index 90%
rename from .devcontainer/cuda12.6-gcc8/devcontainer.json
rename to .devcontainer/cuda12.8-gcc8/devcontainer.json
index a862b1656e5..eefd8226a3e 100644
--- a/.devcontainer/cuda12.6-gcc8/devcontainer.json
+++ b/.devcontainer/cuda12.8-gcc8/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.02-cpp-gcc8-cuda12.6",
+  "image": "rapidsai/devcontainers:25.02-cpp-gcc8-cuda12.8",
   "hostRequirements": {
     "gpu": "optional"
   },
@@ -15,11 +15,11 @@
     "SCCACHE_BUCKET": "rapids-sccache-devs",
     "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
     "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.6-gcc8",
-    "CCCL_CUDA_VERSION": "12.6",
+    "DEVCONTAINER_NAME": "cuda12.8-gcc8",
+    "CCCL_CUDA_VERSION": "12.8",
     "CCCL_HOST_COMPILER": "gcc",
     "CCCL_HOST_COMPILER_VERSION": "8",
-    "CCCL_BUILD_INFIX": "cuda12.6-gcc8",
+    "CCCL_BUILD_INFIX": "cuda12.8-gcc8",
     "CCCL_CUDA_EXTENDED": "false"
   },
   "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
@@ -50,5 +50,5 @@
       }
     }
   },
-  "name": "cuda12.6-gcc8"
+  "name": "cuda12.8-gcc8"
 }
diff --git a/.devcontainer/cuda12.6-gcc9/devcontainer.json b/.devcontainer/cuda12.8-gcc9/devcontainer.json
similarity index 90%
rename from .devcontainer/cuda12.6-gcc9/devcontainer.json
rename to .devcontainer/cuda12.8-gcc9/devcontainer.json
index 39f2d58b97e..abad6fcb3b5 100644
--- a/.devcontainer/cuda12.6-gcc9/devcontainer.json
+++ b/.devcontainer/cuda12.8-gcc9/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.02-cpp-gcc9-cuda12.6",
+  "image": "rapidsai/devcontainers:25.02-cpp-gcc9-cuda12.8",
   "hostRequirements": {
     "gpu": "optional"
   },
@@ -15,11 +15,11 @@
     "SCCACHE_BUCKET": "rapids-sccache-devs",
     "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
     "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.6-gcc9",
-    "CCCL_CUDA_VERSION": "12.6",
+    "DEVCONTAINER_NAME": "cuda12.8-gcc9",
+    "CCCL_CUDA_VERSION": "12.8",
     "CCCL_HOST_COMPILER": "gcc",
     "CCCL_HOST_COMPILER_VERSION": "9",
-    "CCCL_BUILD_INFIX": "cuda12.6-gcc9",
+    "CCCL_BUILD_INFIX": "cuda12.8-gcc9",
     "CCCL_CUDA_EXTENDED": "false"
   },
   "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
@@ -50,5 +50,5 @@
       }
     }
   },
-  "name": "cuda12.6-gcc9"
+  "name": "cuda12.8-gcc9"
 }
diff --git a/.devcontainer/cuda12.6-llvm18/devcontainer.json b/.devcontainer/cuda12.8-llvm14/devcontainer.json
similarity index 91%
rename from .devcontainer/cuda12.6-llvm18/devcontainer.json
rename to .devcontainer/cuda12.8-llvm14/devcontainer.json
index 077066d1727..98e36c7f538 100644
--- a/.devcontainer/cuda12.6-llvm18/devcontainer.json
+++ b/.devcontainer/cuda12.8-llvm14/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.02-cpp-llvm18-cuda12.6",
+  "image": "rapidsai/devcontainers:25.02-cpp-llvm14-cuda12.8",
   "hostRequirements": {
     "gpu": "optional"
   },
@@ -15,11 +15,11 @@
     "SCCACHE_BUCKET": "rapids-sccache-devs",
     "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
     "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.6-llvm18",
-    "CCCL_CUDA_VERSION": "12.6",
+    "DEVCONTAINER_NAME": "cuda12.8-llvm14",
+    "CCCL_CUDA_VERSION": "12.8",
     "CCCL_HOST_COMPILER": "llvm",
-    "CCCL_HOST_COMPILER_VERSION": "18",
-    "CCCL_BUILD_INFIX": "cuda12.6-llvm18",
+    "CCCL_HOST_COMPILER_VERSION": "14",
+    "CCCL_BUILD_INFIX": "cuda12.8-llvm14",
     "CCCL_CUDA_EXTENDED": "false"
   },
   "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
@@ -50,5 +50,5 @@
       }
     }
   },
-  "name": "cuda12.6-llvm18"
+  "name": "cuda12.8-llvm14"
 }
diff --git a/.devcontainer/cuda12.6-llvm15/devcontainer.json b/.devcontainer/cuda12.8-llvm15/devcontainer.json
similarity index 90%
rename from .devcontainer/cuda12.6-llvm15/devcontainer.json
rename to .devcontainer/cuda12.8-llvm15/devcontainer.json
index 4cda886b473..cee653c0c00 100644
--- a/.devcontainer/cuda12.6-llvm15/devcontainer.json
+++ b/.devcontainer/cuda12.8-llvm15/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.02-cpp-llvm15-cuda12.6",
+  "image": "rapidsai/devcontainers:25.02-cpp-llvm15-cuda12.8",
   "hostRequirements": {
     "gpu": "optional"
   },
@@ -15,11 +15,11 @@
     "SCCACHE_BUCKET": "rapids-sccache-devs",
     "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
     "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.6-llvm15",
-    "CCCL_CUDA_VERSION": "12.6",
+    "DEVCONTAINER_NAME": "cuda12.8-llvm15",
+    "CCCL_CUDA_VERSION": "12.8",
     "CCCL_HOST_COMPILER": "llvm",
     "CCCL_HOST_COMPILER_VERSION": "15",
-    "CCCL_BUILD_INFIX": "cuda12.6-llvm15",
+    "CCCL_BUILD_INFIX": "cuda12.8-llvm15",
     "CCCL_CUDA_EXTENDED": "false"
   },
   "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
@@ -50,5 +50,5 @@
       }
     }
   },
-  "name": "cuda12.6-llvm15"
+  "name": "cuda12.8-llvm15"
 }
diff --git a/.devcontainer/cuda12.6-llvm16/devcontainer.json b/.devcontainer/cuda12.8-llvm16/devcontainer.json
similarity index 90%
rename from .devcontainer/cuda12.6-llvm16/devcontainer.json
rename to .devcontainer/cuda12.8-llvm16/devcontainer.json
index d85655f22cf..d7f0dce7566 100644
--- a/.devcontainer/cuda12.6-llvm16/devcontainer.json
+++ b/.devcontainer/cuda12.8-llvm16/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.02-cpp-llvm16-cuda12.6",
+  "image": "rapidsai/devcontainers:25.02-cpp-llvm16-cuda12.8",
   "hostRequirements": {
     "gpu": "optional"
   },
@@ -15,11 +15,11 @@
     "SCCACHE_BUCKET": "rapids-sccache-devs",
     "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
     "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.6-llvm16",
-    "CCCL_CUDA_VERSION": "12.6",
+    "DEVCONTAINER_NAME": "cuda12.8-llvm16",
+    "CCCL_CUDA_VERSION": "12.8",
     "CCCL_HOST_COMPILER": "llvm",
     "CCCL_HOST_COMPILER_VERSION": "16",
-    "CCCL_BUILD_INFIX": "cuda12.6-llvm16",
+    "CCCL_BUILD_INFIX": "cuda12.8-llvm16",
     "CCCL_CUDA_EXTENDED": "false"
   },
   "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
@@ -50,5 +50,5 @@
       }
     }
   },
-  "name": "cuda12.6-llvm16"
+  "name": "cuda12.8-llvm16"
 }
diff --git a/.devcontainer/cuda12.6-llvm17/devcontainer.json b/.devcontainer/cuda12.8-llvm17/devcontainer.json
similarity index 90%
rename from .devcontainer/cuda12.6-llvm17/devcontainer.json
rename to .devcontainer/cuda12.8-llvm17/devcontainer.json
index 43d7a88e4bb..d4d3851dd85 100644
--- a/.devcontainer/cuda12.6-llvm17/devcontainer.json
+++ b/.devcontainer/cuda12.8-llvm17/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.02-cpp-llvm17-cuda12.6",
+  "image": "rapidsai/devcontainers:25.02-cpp-llvm17-cuda12.8",
   "hostRequirements": {
     "gpu": "optional"
   },
@@ -15,11 +15,11 @@
     "SCCACHE_BUCKET": "rapids-sccache-devs",
     "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
     "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.6-llvm17",
-    "CCCL_CUDA_VERSION": "12.6",
+    "DEVCONTAINER_NAME": "cuda12.8-llvm17",
+    "CCCL_CUDA_VERSION": "12.8",
     "CCCL_HOST_COMPILER": "llvm",
     "CCCL_HOST_COMPILER_VERSION": "17",
-    "CCCL_BUILD_INFIX": "cuda12.6-llvm17",
+    "CCCL_BUILD_INFIX": "cuda12.8-llvm17",
     "CCCL_CUDA_EXTENDED": "false"
   },
   "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
@@ -50,5 +50,5 @@
       }
     }
   },
-  "name": "cuda12.6-llvm17"
+  "name": "cuda12.8-llvm17"
 }
diff --git a/.devcontainer/cuda12.8-llvm18/devcontainer.json b/.devcontainer/cuda12.8-llvm18/devcontainer.json
new file mode 100644
index 00000000000..e2e27cbc08c
--- /dev/null
+++ b/.devcontainer/cuda12.8-llvm18/devcontainer.json
@@ -0,0 +1,54 @@
+{
+  "shutdownAction": "stopContainer",
+  "image": "rapidsai/devcontainers:25.02-cpp-llvm18-cuda12.8",
+  "hostRequirements": {
+    "gpu": "optional"
+  },
+  "initializeCommand": [
+    "/bin/bash",
+    "-c",
+    "mkdir -m 0755 -p ${localWorkspaceFolder}/.{aws,cache,config}; mkdir -m 0755 -p ${localWorkspaceFolder}/build;",
+    "if [[ -n ${WSLENV+set} ]]; then docker volume create cccl-build; else docker volume create --driver local --opt type=none --opt device=${localWorkspaceFolder}/build --opt o=bind cccl-build fi;"
+  ],
+  "containerEnv": {
+    "SCCACHE_REGION": "us-east-2",
+    "SCCACHE_BUCKET": "rapids-sccache-devs",
+    "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
+    "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
+    "DEVCONTAINER_NAME": "cuda12.8-llvm18",
+    "CCCL_CUDA_VERSION": "12.8",
+    "CCCL_HOST_COMPILER": "llvm",
+    "CCCL_HOST_COMPILER_VERSION": "18",
+    "CCCL_BUILD_INFIX": "cuda12.8-llvm18",
+    "CCCL_CUDA_EXTENDED": "false"
+  },
+  "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
+  "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/${localWorkspaceFolderBasename},type=bind,consistency=consistent",
+  "mounts": [
+    "source=${localWorkspaceFolder}/.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
+    "source=${localWorkspaceFolder}/.config,target=/home/coder/.config,type=bind,consistency=consistent",
+    "source=cccl-build,target=/home/coder/cccl/build"
+  ],
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "llvm-vs-code-extensions.vscode-clangd",
+        "xaver.clang-format",
+        "nvidia.nsight-vscode-edition",
+        "ms-vscode.cmake-tools"
+      ],
+      "settings": {
+        "editor.defaultFormatter": "xaver.clang-format",
+        "editor.formatOnSave": true,
+        "clang-format.executable": "/usr/bin/clang-format",
+        "clangd.arguments": [
+          "--compile-commands-dir=${workspaceFolder}"
+        ],
+        "files.eol": "\n",
+        "files.trimTrailingWhitespace": true
+      }
+    }
+  },
+  "name": "cuda12.8-llvm18"
+}
diff --git a/.devcontainer/cuda12.6ext-gcc13/devcontainer.json b/.devcontainer/cuda12.8ext-gcc13/devcontainer.json
similarity index 90%
rename from .devcontainer/cuda12.6ext-gcc13/devcontainer.json
rename to .devcontainer/cuda12.8ext-gcc13/devcontainer.json
index 3c83bc62d95..fe118768428 100644
--- a/.devcontainer/cuda12.6ext-gcc13/devcontainer.json
+++ b/.devcontainer/cuda12.8ext-gcc13/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.02-cpp-gcc13-cuda12.6ext",
+  "image": "rapidsai/devcontainers:25.02-cpp-gcc13-cuda12.8ext",
   "hostRequirements": {
     "gpu": "optional"
   },
@@ -15,11 +15,11 @@
     "SCCACHE_BUCKET": "rapids-sccache-devs",
     "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
     "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.6ext-gcc13",
-    "CCCL_CUDA_VERSION": "12.6",
+    "DEVCONTAINER_NAME": "cuda12.8ext-gcc13",
+    "CCCL_CUDA_VERSION": "12.8",
     "CCCL_HOST_COMPILER": "gcc",
     "CCCL_HOST_COMPILER_VERSION": "13",
-    "CCCL_BUILD_INFIX": "cuda12.6ext-gcc13",
+    "CCCL_BUILD_INFIX": "cuda12.8ext-gcc13",
     "CCCL_CUDA_EXTENDED": "true"
   },
   "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
@@ -50,5 +50,5 @@
       }
     }
   },
-  "name": "cuda12.6ext-gcc13"
+  "name": "cuda12.8ext-gcc13"
 }
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 1c7203a1a3c..0cf38c8b80b 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,6 +1,6 @@
 {
   "shutdownAction": "stopContainer",
-  "image": "rapidsai/devcontainers:25.02-cpp-gcc13-cuda12.6",
+  "image": "rapidsai/devcontainers:25.02-cpp-gcc13-cuda12.8",
   "hostRequirements": {
     "gpu": "optional"
   },
@@ -15,11 +15,11 @@
     "SCCACHE_BUCKET": "rapids-sccache-devs",
     "AWS_ROLE_ARN": "arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs",
     "HISTFILE": "${containerWorkspaceFolder}/.cache/._bash_history",
-    "DEVCONTAINER_NAME": "cuda12.6-gcc13",
-    "CCCL_CUDA_VERSION": "12.6",
+    "DEVCONTAINER_NAME": "cuda12.8-gcc13",
+    "CCCL_CUDA_VERSION": "12.8",
     "CCCL_HOST_COMPILER": "gcc",
     "CCCL_HOST_COMPILER_VERSION": "13",
-    "CCCL_BUILD_INFIX": "cuda12.6-gcc13",
+    "CCCL_BUILD_INFIX": "cuda12.8-gcc13",
     "CCCL_CUDA_EXTENDED": "false"
   },
   "workspaceFolder": "/home/coder/${localWorkspaceFolderBasename}",
@@ -50,5 +50,5 @@
       }
     }
   },
-  "name": "cuda12.6-gcc13"
+  "name": "cuda12.8-gcc13"
 }
diff --git a/.github/actions/workflow-build/build-workflow.py b/.github/actions/workflow-build/build-workflow.py
index 62b90f1472d..41ea47a96a2 100755
--- a/.github/actions/workflow-build/build-workflow.py
+++ b/.github/actions/workflow-build/build-workflow.py
@@ -930,12 +930,19 @@ def validate_tags(matrix_job, ignore_required=False):
                 error_message_with_matrix_job(matrix_job, f"Unknown tag '{tag}'")
             )
 
-    if "gpu" in matrix_job and matrix_job["gpu"] not in matrix_yaml["gpus"].keys():
-        raise Exception(
-            error_message_with_matrix_job(
-                matrix_job, f"Unknown gpu '{matrix_job['gpu']}'"
-            )
+    if "gpu" in matrix_job:  # tag may hold one GPU name or a list of names
+        gpus = (
+            matrix_job["gpu"]
+            if isinstance(matrix_job["gpu"], list)
+            else [matrix_job["gpu"]]
         )
+        for gpu in gpus:
+            if gpu not in matrix_yaml["gpus"].keys():
+                raise Exception(
+                    error_message_with_matrix_job(
+                        matrix_job, f"Unknown gpu '{gpu}'"  # name only the bad entry
+                    )
+                )
 
 
 def set_default_tags(matrix_job):
diff --git a/.github/workflows/build-matx.yml b/.github/workflows/build-matx.yml
index 0c6ef548555..3544b8823c5 100644
--- a/.github/workflows/build-matx.yml
+++ b/.github/workflows/build-matx.yml
@@ -61,7 +61,7 @@ jobs:
 
           .devcontainer/launch.sh \
             --docker \
-            --cuda 12.6 \
+            --cuda 12.8 \
             --host gcc13 \
             --cuda-ext \
             --env VAULT_HOST= \
diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 5ec715fb59b..dff3b75df8e 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -19,26 +19,24 @@ workflows:
     - {jobs: ['build'], std: 'max', cxx: ['msvc2019']}
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang', 'msvc']}
     # Current CTK testing:
-    - {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx4090'}
-    - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang'], gpu: 'rtx2080'}
-    # Disabled until we figure out the issue with the TBB dll
-   #- {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['msvc'], gpu: 'rtx4090'}
-    - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['msvc'], gpu: 'rtx2080'}
+    - {jobs: ['test'],  project: ['thrust'],     std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx4090'}
+    - {jobs: ['test'],  project: ['libcudacxx'], std: 'max', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
     # Split up cub tests:
     - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 'rtxa6000'}
     - {jobs: ['test_lid1', 'test_lid2'],  project: ['cub'], std: 'max', cxx: ['gcc'],           gpu: 'rtxa6000'}
     - {jobs: ['test_nolid', 'test_lid0'], project: ['cub'], std: 'max', cxx: ['clang', 'msvc'], gpu: 'rtxa6000'}
-    - {jobs: ['test_lid0'],               project: ['cub'], std: 'max', cxx: 'gcc12',           gpu: 'h100', sm: 'gpu' }
     # Modded builds:
     - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'}
     - {jobs: ['build'], std: 'max', cxx: ['gcc', 'clang'], cpu: 'arm64'}
-    - {jobs: ['build'], std: 'max', cxx: ['gcc'], sm: '90a'}
+    - {jobs: ['build'], std: 'max', cxx: 'gcc', sm: ['90;90a;100']}
+    - {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'h100', sm: 'gpu' }
     # Test Thrust 32-bit-only dispatch here, since it's most likely to break. 64-bit-only is tested in nightly.
     - {jobs: ['test_gpu'],  project: 'thrust', cmake_options: '-DTHRUST_DISPATCH_TYPE=Force32bit', gpu: 'rtx4090'}
     # default_projects: clang-cuda
     - {jobs: ['build'], std: 'all', cudacxx: 'clang', cxx: 'clang'}
-    - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90'}
-    - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90a'}
+    # Disabled; see discussion on #3633. Should be fixed in clang-20.
+    # - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90'}
+    # - {jobs: ['build'], project: 'libcudacxx', std: 'max', cudacxx: 'clang', cxx: 'clang', sm: '90a'}
     # nvrtc:
     - {jobs: ['nvrtc'], project: 'libcudacxx', std: 'all', gpu: 'rtx2080', sm: 'gpu'}
     # verify-codegen:
@@ -54,7 +52,7 @@ workflows:
     - {jobs: ['build'], project: 'cudax', ctk: ['curr'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}
     - {jobs: ['test'],  project: 'cudax', ctk: ['curr'], std: 20,    cxx: ['gcc12', 'clang', 'msvc'], gpu: 'rtx2080'}
     # Python and c/parallel jobs:
-    - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], ctk: '12.6', gpu: 'rtx2080'}
+    - {jobs: ['test'], project: ['cccl_c_parallel', 'python'], gpu: 'rtx2080'}
     # cccl-infra:
     - {jobs: ['infra'], project: 'cccl', ctk: '12.0', cxx: ['gcc12', 'clang14'], gpu: 'rtx2080'}
     - {jobs: ['infra'], project: 'cccl', ctk: 'curr', cxx: ['gcc',   'clang'],   gpu: 'rtx2080'}
@@ -73,18 +71,18 @@ workflows:
     - {jobs: ['build'], std: 'all', cxx: ['msvc2019']}
     # Test current CTK
     - {jobs: ['test'],      project: 'cub',        std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtxa6000'}
-    - {jobs: ['test_lid0'], project: 'cub',        std: 'max', cxx: 'gcc',                    gpu: 'v100'}
-    - {jobs: ['test_lid0'], project: 'cub',        std: 'max', cxx: 'gcc',                    gpu: 'h100', sm: 'gpu' }
     - {jobs: ['test'],      project: 'thrust',     std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx4090'}
     - {jobs: ['test'],      project: 'libcudacxx', std: 'all', cxx: ['gcc', 'clang', 'msvc'], gpu: 'rtx2080'}
     # Modded builds:
     - {jobs: ['build'], std: 'all', ctk: '12.5', cxx: 'nvhpc'}
     - {jobs: ['build'], std: 'all', cxx: ['gcc', 'clang'], cpu: 'arm64'}
     - {jobs: ['build'], std: 'all', cxx: ['gcc'], sm: '90a'}
+    - {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: ['v100', 't4', 'l4', 'h100'], sm: '70;75;89;90;100'}
     # default_projects: clang-cuda
     - {jobs: ['build'], std: 'all', cudacxx: 'clang', cxx: 'clang'}
-    - {jobs: ['build'], project: 'libcudacxx', std: 'all', cudacxx: 'clang', cxx: 'clang', sm: '90'}
-    - {jobs: ['build'], project: 'libcudacxx', std: 'all', cudacxx: 'clang', cxx: 'clang', sm: '90a'}
+    # Disabled; see discussion on #3633. Should be fixed in clang-20.
+    # - {jobs: ['build'], project: 'libcudacxx', std: 'all', cudacxx: 'clang', cxx: 'clang', sm: '90'}
+    # - {jobs: ['build'], project: 'libcudacxx', std: 'all', cudacxx: 'clang', cxx: 'clang', sm: '90a'}
     # cudax
     - {jobs: ['build'], project: 'cudax', ctk: ['12.0', 'curr'], std: 'all', cxx: ['gcc9', 'gcc10', 'gcc11']}
     - {jobs: ['build'], project: 'cudax', ctk: [        'curr'], std: 'all', cxx: ['clang14', 'clang15', 'clang16', 'clang17']}
@@ -116,7 +114,7 @@ all_stds: [17, 20]
 ctk_versions:
   12.0: { stds: [17, 20] }
   12.5: { stds: [17, 20] }
-  12.6: { stds: [17, 20], aka: 'curr' }
+  12.8: { stds: [17, 20], aka: 'curr' }
 
 device_compilers:
   nvcc: # Version / stds are taken from CTK
@@ -269,7 +267,7 @@ tags:
   # CPU architecture
   cpu: { default: 'amd64' }
   # GPU model
-  gpu: { default: 'v100' }
+  gpu: { default: 'rtx2080' }
   # Host compiler {name, version, exe}
   # See the `host_compilers` map.
   cxx: { default: 'gcc' }
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h b/libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h
index fef34f25ef4..2e049bc6217 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/nvrtc_workaround.h
@@ -13,6 +13,25 @@
 // required for the PTX tests, we define them here outside the header guards.
 // TODO(bgruber): limit this workaround to NVRTC versions older than the first one shipping those macros
 #ifdef __CUDACC_RTC__
+
+// missing SM_100
+#  define _NV_TARGET_VAL_SM_100             1000
+#  define _NV_TARGET___NV_PROVIDES_SM_100   (_NV_TARGET_PROVIDES(_NV_TARGET_VAL_SM_100))
+#  define _NV_TARGET___NV_IS_EXACTLY_SM_100 (_NV_TARGET_IS_EXACTLY(_NV_TARGET_VAL_SM_100))
+#  define NV_PROVIDES_SM_100                __NV_PROVIDES_SM_100
+#  define NV_IS_EXACTLY_SM_100              __NV_IS_EXACTLY_SM_100
+#  if (_NV_TARGET___NV_IS_EXACTLY_SM_100)
+#    define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_100 1
+#  else
+#    define _NV_TARGET_BOOL___NV_IS_EXACTLY_SM_100 0
+#  endif
+#  if (_NV_TARGET___NV_PROVIDES_SM_100)
+#    define _NV_TARGET_BOOL___NV_PROVIDES_SM_100 1
+#  else
+#    define _NV_TARGET_BOOL___NV_PROVIDES_SM_100 0
+#  endif
+
+// missing SM_100a
 #  ifndef NV_HAS_FEATURE_SM_100a
 #    define NV_HAS_FEATURE_SM_100a __NV_HAS_FEATURE_SM_100a
 #    if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000) && defined(__CUDA_ARCH_FEAT_SM100_ALL))
@@ -22,7 +41,7 @@
 #    endif
 #  endif // NV_HAS_FEATURE_SM_100a
 
-// Re-enable sm_101a support in nvcc.
+// missing SM_101a
 #  ifndef NV_HAS_FEATURE_SM_101a
 #    define NV_HAS_FEATURE_SM_101a __NV_HAS_FEATURE_SM_101a
 #    if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1010) && defined(__CUDA_ARCH_FEAT_SM101_ALL))
@@ -31,4 +50,5 @@
 #      define _NV_TARGET_BOOL___NV_HAS_FEATURE_SM_101a 0
 #    endif
 #  endif // NV_HAS_FEATURE_SM_101a
+
 #endif // __CUDACC_RTC__
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp
index 212414c4535..bc4e3b1b9d4 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.clusterlaunchcontrol.compile.pass.cpp
@@ -14,6 +14,8 @@
 #include <cuda/ptx>
 #include <cuda/std/utility>
 
+#include "nvrtc_workaround.h"
+// above header needs to be included before the generated test header
 #include "generated/clusterlaunchcontrol.h"
 
 int main(int, char**)
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp
index 951e1a9f513..56c9c9027ec 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.st.bulk.compile.pass.cpp
@@ -14,6 +14,8 @@
 #include <cuda/ptx>
 #include <cuda/std/utility>
 
+#include "nvrtc_workaround.h"
+// above header needs to be included before the generated test header
 #include "generated/st_bulk.h"
 
 int main(int, char**)
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp
index 49f9df928e9..6ba6bb390b9 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.alloc.compile.pass.cpp
@@ -14,6 +14,8 @@
 #include <cuda/ptx>
 #include <cuda/std/utility>
 
+#include "nvrtc_workaround.h"
+// above header needs to be included before the generated test header
 #include "generated/tcgen05_alloc.h"
 
 int main(int, char**)
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp
index 73ea1851bec..1408b505381 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.commit.compile.pass.cpp
@@ -14,6 +14,8 @@
 #include <cuda/ptx>
 #include <cuda/std/utility>
 
+#include "nvrtc_workaround.h"
+// above header needs to be included before the generated test header
 #include "generated/tcgen05_commit.h"
 
 int main(int, char**)
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp
index 85ddc17efe4..133e47b9d03 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.cp.compile.pass.cpp
@@ -14,6 +14,8 @@
 #include <cuda/ptx>
 #include <cuda/std/utility>
 
+#include "nvrtc_workaround.h"
+// above header needs to be included before the generated test header
 #include "generated/tcgen05_cp.h"
 
 int main(int, char**)
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp
index fda57b348de..1f0f830572b 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.fence.compile.pass.cpp
@@ -14,6 +14,8 @@
 #include <cuda/ptx>
 #include <cuda/std/utility>
 
+#include "nvrtc_workaround.h"
+// above header needs to be included before the generated test header
 #include "generated/tcgen05_fence.h"
 
 int main(int, char**)
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp
index 8da8e54f18d..5b407140162 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.ld.compile.pass.cpp
@@ -14,6 +14,8 @@
 #include <cuda/ptx>
 #include <cuda/std/utility>
 
+#include "nvrtc_workaround.h"
+// above header needs to be included before the generated test header
 #include "generated/tcgen05_ld.h"
 
 int main(int, char**)
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp
index 098cbbfa896..082bc0ceb4b 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.compile.pass.cpp
@@ -14,6 +14,8 @@
 #include <cuda/ptx>
 #include <cuda/std/utility>
 
+#include "nvrtc_workaround.h"
+// above header needs to be included before the generated test header
 #include "generated/tcgen05_mma.h"
 
 int main(int, char**)
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp
index 350c964d749..272710c3601 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.mma.ws.compile.pass.cpp
@@ -14,6 +14,8 @@
 #include <cuda/ptx>
 #include <cuda/std/utility>
 
+#include "nvrtc_workaround.h"
+// above header needs to be included before the generated test header
 #include "generated/tcgen05_mma_ws.h"
 
 int main(int, char**)
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp
index 5ecfff7ff3b..5079b71d6ab 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.shift.compile.pass.cpp
@@ -14,6 +14,8 @@
 #include <cuda/ptx>
 #include <cuda/std/utility>
 
+#include "nvrtc_workaround.h"
+// above header needs to be included before the generated test header
 #include "generated/tcgen05_shift.h"
 
 int main(int, char**)
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp
index 92a49224f0e..fa214acb37e 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.st.compile.pass.cpp
@@ -14,6 +14,8 @@
 #include <cuda/ptx>
 #include <cuda/std/utility>
 
+#include "nvrtc_workaround.h"
+// above header needs to be included before the generated test header
 #include "generated/tcgen05_st.h"
 
 int main(int, char**)
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp
index 4bb3156ed12..ef368bf0a02 100644
--- a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tcgen05.wait.compile.pass.cpp
@@ -14,6 +14,8 @@
 #include <cuda/ptx>
 #include <cuda/std/utility>
 
+#include "nvrtc_workaround.h"
+// above header needs to be included before the generated test header
 #include "generated/tcgen05_wait.h"
 
 int main(int, char**)
diff --git a/libcudacxx/test/libcudacxx/std/utilities/meta/meta.unary/meta.unary.prop/is_constructible.pass.cpp b/libcudacxx/test/libcudacxx/std/utilities/meta/meta.unary/meta.unary.prop/is_constructible.pass.cpp
index c9e544789be..a766e4ad038 100644
--- a/libcudacxx/test/libcudacxx/std/utilities/meta/meta.unary/meta.unary.prop/is_constructible.pass.cpp
+++ b/libcudacxx/test/libcudacxx/std/utilities/meta/meta.unary/meta.unary.prop/is_constructible.pass.cpp
@@ -146,8 +146,8 @@ template <class T, class A0>
 __host__ __device__ void test_is_not_constructible()
 {
   static_assert((!cuda::std::is_constructible<T, A0>::value), "");
-#if !defined(TEST_COMPILER_MSVC) && !(defined(TEST_COMPILER_CLANG) && __clang_major__ >= 16)
-  // The fallback SFINAE version doesn't work reliable with MSVC, and we don't
+#if !defined(TEST_COMPILER_MSVC) && !defined(TEST_COMPILER_CLANG) && !defined(TEST_COMPILER_NVRTC)
+  // The fallback SFINAE version doesn't work reliably with Clang/MSVC/NVRTC, and we don't
   // use it, so waive it.
   static_assert((!cuda::std::__cccl_is_constructible<T, A0>::type::value), "");
 #endif