Backport to 2.8: PTX support for Blackwell (#3624)
* Sync ptx_dot_variants.h with libcuda-ptx (#3564)

* Update ptx_isa.h to include 8.6 and 8.7 (#3563)

* PTX: Update generated files with Blackwell instructions (#3568)

* ptx: Update existing instructions
* ptx: Add new instructions
* Fix returning error out values
See:
- https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/74
- https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/73
* ptx: Fix out var declaration
See  https://gitlab-master.nvidia.com/CCCL/libcuda-ptx/-/merge_requests/75
* mbarrier.{test,try}_wait: Fix test. Wrong files were included.
* docs: Fix special registers include
* Allow non-included documentation pages
* Workaround NVRTC

Co-authored-by: Allard Hendriksen <[email protected]>

* PTX: Remove internal instructions (#3583)

* barrier.cluster.aligned: Remove
This is not supposed to be exposed in CCCL.

* elect.sync: Remove
Not ready for inclusion yet. This needs to handle the optional extra
output mask as well.

* mapa: Remove
This has compiler bugs. We should use intrinsics instead.

Co-authored-by: Allard Hendriksen <[email protected]>

* PTX: Update existing instructions (#3584)

* mbarrier.expect_tx: Add missing source and test
It was already documented(!)

* cp.async.bulk.tensor: Add .{gather,scatter}4
* fence: Add .sync_restrict, .proxy.async.sync_restrict

Co-authored-by: Allard Hendriksen <[email protected]>

* PTX: Add clusterlaunchcontrol (#3589)

Co-authored-by: Allard Hendriksen <[email protected]>

* PTX: Add cp.async.mbarrier.arrive{.noinc} (#3602)

Co-authored-by: Allard Hendriksen <[email protected]>

* PTX: Add multimem instructions (#3603)

* Add multimem.ld_reduce
* Add multimem.red
* Add multimem.st

Co-authored-by: Allard Hendriksen <[email protected]>

* PTX: Add st.bulk (#3604)

Co-authored-by: Allard Hendriksen <[email protected]>

* PTX: Add tcgen05 instructions (#3607)

* ptx: Add tcgen05.alloc

* ptx: Add tcgen05.commit

* ptx: Add tcgen05.cp

* ptx: Add tcgen05.fence

* ptx: Add tcgen05.ld

* ptx: Add tcgen05.mma

* ptx: Add tcgen05.mma.ws

* ptx: Add tcgen05.shift

* ptx: Add tcgen05.st

* ptx: Add tcgen05.wait

* Fix docs

---------

Co-authored-by: Allard Hendriksen <[email protected]>

---------

Co-authored-by: Allard Hendriksen <[email protected]>
bernhardmgruber and ahendriksen authored Jan 31, 2025
1 parent 496ee97 commit fcc5205
Showing 205 changed files with 58,974 additions and 2,978 deletions.
16 changes: 16 additions & 0 deletions docs/libcudacxx/ptx/instructions.rst
@@ -7,10 +7,12 @@ PTX Instructions
:maxdepth: 1

instructions/barrier_cluster
instructions/clusterlaunchcontrol
instructions/cp_async_bulk
instructions/cp_async_bulk_commit_group
instructions/cp_async_bulk_wait_group
instructions/cp_async_bulk_tensor
instructions/cp_async_mbarrier_arrive
instructions/cp_reduce_async_bulk
instructions/cp_reduce_async_bulk_tensor
instructions/fence
@@ -21,8 +23,22 @@ PTX Instructions
instructions/mbarrier_expect_tx
instructions/mbarrier_test_wait
instructions/mbarrier_try_wait
instructions/multimem_ld_reduce
instructions/multimem_red
instructions/multimem_st
instructions/red_async
instructions/st_async
instructions/st_bulk
instructions/tcgen05_alloc
instructions/tcgen05_commit
instructions/tcgen05_cp
instructions/tcgen05_fence
instructions/tcgen05_ld
instructions/tcgen05_mma
instructions/tcgen05_mma_ws
instructions/tcgen05_shift
instructions/tcgen05_st
instructions/tcgen05_wait
instructions/tensormap_replace
instructions/tensormap_cp_fenceproxy
instructions/special_registers
11 changes: 11 additions & 0 deletions docs/libcudacxx/ptx/instructions/clusterlaunchcontrol.rst
@@ -0,0 +1,11 @@
.. _libcudacxx-ptx-instructions-clusterlaunchcontrol:

clusterlaunchcontrol
====================

- PTX ISA:
`clusterlaunchcontrol.try_cancel <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-clusterlaunchcontrol-try-cancel>`__
- PTX ISA:
`clusterlaunchcontrol.query_cancel <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-clusterlaunchcontrol-query-cancel>`__

.. include:: generated/clusterlaunchcontrol.rst
5 changes: 5 additions & 0 deletions docs/libcudacxx/ptx/instructions/cp_async_bulk_tensor.rst
@@ -21,3 +21,8 @@ Multicast
---------

.. include:: generated/cp_async_bulk_tensor_multicast.rst

Scatter / Gather
----------------

.. include:: generated/cp_async_bulk_tensor_gather_scatter.rst
10 changes: 10 additions & 0 deletions docs/libcudacxx/ptx/instructions/cp_async_mbarrier_arrive.rst
@@ -0,0 +1,10 @@
.. _libcudacxx-ptx-instructions-cp-async-mbarrier-arrive:

cp.async.mbarrier.arrive
========================

- PTX ISA:
`cp.async.mbarrier.arrive <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive>`__

.. include:: generated/cp_async_mbarrier_arrive.rst
.. include:: generated/cp_async_mbarrier_arrive_noinc.rst
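
As a usage sketch (not part of this diff): these wrappers let an mbarrier, rather than a commit group, track completion of prior ``cp.async`` copies. Assuming the generated functions take the shared-memory mbarrier address as their only parameter, mirroring the PTX operand (check the generated header for the exact signature):

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Hypothetical sketch, not from this commit.
   __device__ void track_copies_with_mbarrier(uint64_t* smem_bar)
   {
     // ... issue cp.async copies from this thread ...

     // Trigger an arrive-on once all prior cp.async copies by this thread
     // complete; by default the mbarrier's pending count is incremented.
     cuda::ptx::cp_async_mbarrier_arrive(smem_bar);
   }

   __device__ void track_copies_noinc(uint64_t* smem_bar)
   {
     // ... issue cp.async copies from this thread ...

     // Same, but the .noinc form leaves the pending count unchanged.
     cuda::ptx::cp_async_mbarrier_arrive_noinc(smem_bar);
   }
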
10 changes: 10 additions & 0 deletions docs/libcudacxx/ptx/instructions/fence.rst
@@ -13,6 +13,11 @@ fence

.. include:: generated/fence.rst

fence.sync_restrict
-------------------

.. include:: generated/fence_sync_restrict.rst

fence.mbarrier_init
-------------------

@@ -29,6 +34,11 @@ fence.proxy.async

.. include:: generated/fence_proxy_async.rst

fence.proxy.async.sync_restrict
-------------------------------

.. include:: generated/fence_proxy_async_generic_sync_restrict.rst

fence.proxy.tensormap
---------------------

@@ -0,0 +1,68 @@
..
This file was automatically generated. Do not edit.
clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 [addr], [smem_bar]; // PTX ISA 86, SM_100
template <typename = void>
__device__ static inline void clusterlaunchcontrol_try_cancel(
void* addr,
uint64_t* smem_bar);
clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.multicast::cluster::all.b128 [addr], [smem_bar]; // PTX ISA 86, SM_100a, SM_101a
template <typename = void>
__device__ static inline void clusterlaunchcontrol_try_cancel_multicast(
void* addr,
uint64_t* smem_bar);
clusterlaunchcontrol.query_cancel.is_canceled.pred.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// clusterlaunchcontrol.query_cancel.is_canceled.pred.b128 pred_is_canceled, try_cancel_response; // PTX ISA 86, SM_100
template <typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
__device__ static inline bool clusterlaunchcontrol_query_cancel_is_canceled(
B128 try_cancel_response);
clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// clusterlaunchcontrol.query_cancel.get_first_ctaid::x.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
__device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_x(
B128 try_cancel_response);
clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// clusterlaunchcontrol.query_cancel.get_first_ctaid::y.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
__device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_y(
B128 try_cancel_response);
clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// clusterlaunchcontrol.query_cancel.get_first_ctaid::z.b32.b128 ret_dim, try_cancel_response; // PTX ISA 86, SM_100
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
__device__ static inline B32 clusterlaunchcontrol_query_cancel_get_first_ctaid_z(
B128 try_cancel_response);
clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// clusterlaunchcontrol.query_cancel.get_first_ctaid.v4.b32.b128 block_dim, try_cancel_response; // PTX ISA 86, SM_100
template <typename B32, enable_if_t<sizeof(B32) == 4, bool> = true, typename B128, enable_if_t<sizeof(B128) == 16, bool> = true>
__device__ static inline void clusterlaunchcontrol_query_cancel_get_first_ctaid(
B32 (&block_dim)[4],
B128 try_cancel_response);
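
Taken together, the signatures above suggest the following end-to-end pattern. This is a hedged sketch, not code from this commit: the mbarrier wait is elided, and ``__int128`` stands in for any 16-byte type satisfying the ``sizeof(B128) == 16`` constraint.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch: ask the scheduler to cancel a not-yet-launched cluster, then
   // decode the 16-byte response once it has landed in shared memory.
   __device__ void cancel_and_decode(void* response, // 16-byte aligned, in shared memory
                                     uint64_t* smem_bar)
   {
     // Write the try-cancel response to `response` asynchronously;
     // completion is signaled on `smem_bar` (complete_tx::bytes).
     cuda::ptx::clusterlaunchcontrol_try_cancel(response, smem_bar);

     // ... wait on smem_bar here, e.g. with mbarrier.try_wait ...

     __int128 r = *static_cast<__int128*>(response);
     if (cuda::ptx::clusterlaunchcontrol_query_cancel_is_canceled(r))
     {
       // First CTA id of the canceled cluster, one component at a time.
       uint32_t x = cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_x<uint32_t>(r);
       uint32_t y = cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_y<uint32_t>(r);
       uint32_t z = cuda::ptx::clusterlaunchcontrol_query_cancel_get_first_ctaid_z<uint32_t>(r);
       // ... requeue the work that cluster (x, y, z) would have done ...
     }
   }
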
38 changes: 35 additions & 3 deletions docs/libcudacxx/ptx/instructions/generated/cp_async_bulk.rst
@@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // 1a. unicast PTX ISA 80, SM_90
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 80, SM_90
// .dst = { .shared::cluster }
// .src = { .global }
template <typename = void>
@@ -17,11 +17,27 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes
const uint32_t& size,
uint64_t* smem_bar);
cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [smem_bar]; // PTX ISA 86, SM_90
// .dst = { .shared::cta }
// .src = { .global }
template <typename = void>
__device__ static inline void cp_async_bulk(
cuda::ptx::space_shared_t,
cuda::ptx::space_global_t,
void* dstMem,
const void* srcMem,
const uint32_t& size,
uint64_t* smem_bar);
cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // 2. PTX ISA 80, SM_90
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [rdsmem_bar]; // PTX ISA 80, SM_90
// .dst = { .shared::cluster }
// .src = { .shared::cta }
template <typename = void>
@@ -37,7 +53,7 @@ cp.async.bulk.global.shared::cta.bulk_group
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // 3. PTX ISA 80, SM_90
// cp.async.bulk.dst.src.bulk_group [dstMem], [srcMem], size; // PTX ISA 80, SM_90
// .dst = { .global }
// .src = { .shared::cta }
template <typename = void>
@@ -47,3 +63,19 @@ cp.async.bulk.global.shared::cta.bulk_group
void* dstMem,
const void* srcMem,
const uint32_t& size);
cp.async.bulk.global.shared::cta.bulk_group.cp_mask
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// cp.async.bulk.dst.src.bulk_group.cp_mask [dstMem], [srcMem], size, byteMask; // PTX ISA 86, SM_100
// .dst = { .global }
// .src = { .shared::cta }
template <typename = void>
__device__ static inline void cp_async_bulk_cp_mask(
cuda::ptx::space_global_t,
cuda::ptx::space_shared_t,
void* dstMem,
const void* srcMem,
const uint32_t& size,
const uint16_t& byteMask);
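
A sketch combining the two additions above. Assumptions are labeled: the ``cuda::ptx::space_shared`` / ``space_global`` tag values are taken from the parameter types shown in the signatures, and the mbarrier wait plus the bulk-group commit/wait are elided.

.. code:: cuda

   #include <cuda/ptx>
   #include <cstdint>

   // Sketch: stage global data into CTA-local shared memory, then write it
   // back with a byte mask. Both variants were added in this commit.
   __device__ void stage_then_masked_writeback(void* gmem, void* smem,
                                               uint32_t size, uint16_t byteMask,
                                               uint64_t* smem_bar)
   {
     // Global -> shared::cta bulk copy; completion is tracked on `smem_bar`
     // (the new PTX ISA 8.6 overload with a shared::cta destination).
     cuda::ptx::cp_async_bulk(cuda::ptx::space_shared, cuda::ptx::space_global,
                              smem, gmem, size, smem_bar);

     // ... wait on smem_bar, compute on smem ...

     // Shared::cta -> global write-back; each bit of `byteMask` enables one
     // byte within every 16-byte chunk (cp_mask variant, SM_100).
     cuda::ptx::cp_async_bulk_cp_mask(cuda::ptx::space_global,
                                      cuda::ptx::space_shared,
                                      gmem, smem, size, byteMask);

     // ... commit and wait on the bulk group to make the write-back visible ...
   }
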
@@ -5,7 +5,7 @@ cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code:: cuda
// cp.async.bulk{.dst}{.src}.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // 1. PTX ISA 80, SM_90a
// cp.async.bulk.dst.src.mbarrier::complete_tx::bytes.multicast::cluster [dstMem], [srcMem], size, [smem_bar], ctaMask; // PTX ISA 80, SM_90a, SM_100a, SM_101a
// .dst = { .shared::cluster }
// .src = { .global }
template <typename = void>