feat: start implementing attention #89

Draft: wants to merge 17 commits into `dev`
CMakeLists.txt (16 changes: 12 additions & 4 deletions)
@@ -12,12 +12,20 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
+# Download with:
+#
+# mkdir -p cmake
+# wget -O cmake/CPM.cmake https://github.com/cpm-cmake/CPM.cmake/releases/latest/download/get_cpm.cmake
+include(cmake/CPM.cmake)
+
 if(USE_CUDA)
+    CPMAddPackage(NAME CCCL SOURCE_DIR ${CMAKE_SOURCE_DIR}/3rd-party/cccl)
+
     add_compile_definitions(USE_CUDA)
     enable_language(CUDA)
     set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
     if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        set(CMAKE_CUDA_ARCHITECTURES 80)
+        set(CMAKE_CUDA_ARCHITECTURES native)
     endif()
     if(NOT DEFINED CMAKE_CUDA_STANDARD)
         set(CMAKE_CUDA_STANDARD 17)
@@ -45,7 +53,7 @@ endif()
 if (USE_BANG)
     add_compile_definitions(USE_BANG)
     include_directories(src/kernels/mlu/include)
 
     # Neuware Environment
     if ((NOT DEFINED NEUWARE_HOME) AND (NOT DEFINED ENV{NEUWARE_HOME}))
         message(FATAL_ERROR "NEUWARE_HOME is not defined from cmake or env")
@@ -55,14 +63,14 @@ if (USE_BANG)
         set(NEUWARE_HOME $ENV{NEUWARE_HOME} CACHE STRING "NEUWARE_HOME directory for Cambricon Neuware development")
     endif()
     message(STATUS "NEUWARE_HOME: ${NEUWARE_HOME}")
 
     # cnrt cndrv cnnl
     include_directories("${NEUWARE_HOME}/include")
     find_library(CAMBRICON_CNNL libcnnl.so "${NEUWARE_HOME}/lib64")
     find_library(CAMBRICON_CNRT libcnrt.so "${NEUWARE_HOME}/lib64")
     find_library(CAMBRICON_CNDRV libcndrv.so "${NEUWARE_HOME}/lib64")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lstdc++ -Wall")
 
     if ((NOT DEFINED TARGET_CPU_ARCH) AND (NOT DEFINED ENV{TARGET_CPU_ARCH}))
         execute_process(COMMAND uname -m OUTPUT_VARIABLE _uname_m OUTPUT_STRIP_TRAILING_WHITESPACE)
         set(TARGET_CPU_ARCH "${_uname_m}" CACHE STRING "Target CPU ARCH")
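Note on this file's two behavioral changes: `CPMAddPackage(... SOURCE_DIR ...)` consumes the CCCL checkout vendored at `3rd-party/cccl` instead of fetching it from the network, and replacing the hard-coded `set(CMAKE_CUDA_ARCHITECTURES 80)` with `native` makes nvcc target whatever GPU is present on the build machine at configure time. `native` is only understood by CMake 3.24 and newer; on older CMake, pass `-DCMAKE_CUDA_ARCHITECTURES=<arch>` explicitly (the `if(NOT DEFINED ...)` guard keeps that override working).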
cmake/CPM.cmake (24 changes: 24 additions & 0 deletions, new file)
@@ -0,0 +1,24 @@
# SPDX-License-Identifier: MIT
#
# SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors

set(CPM_DOWNLOAD_VERSION 0.38.7)
set(CPM_HASH_SUM "83e5eb71b2bbb8b1f2ad38f1950287a057624e385c238f6087f94cdfc44af9c5")

if(CPM_SOURCE_CACHE)
  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
elseif(DEFINED ENV{CPM_SOURCE_CACHE})
  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
else()
  set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
endif()

# Expand relative path. This is important if the provided path contains a tilde (~)
get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)

file(DOWNLOAD
     https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
     ${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM}
)

include(${CPM_DOWNLOAD_LOCATION})
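This is the standard CPM bootstrap script: it pins release 0.38.7 by SHA-256, so a corrupted or tampered download fails the `EXPECTED_HASH` check at configure time, and when `CPM_SOURCE_CACHE` is set (as a CMake variable or in the environment) the script is stored once under that shared cache and reused across build trees instead of being re-downloaded per build.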
src/02hardware/include/hardware/devices/nvidia.h (6 changes: 6 additions & 0 deletions)
@@ -3,6 +3,12 @@
 
 #include "../device.h"
 
+#define CUDA_ASSERT(STATUS) \
+    if (auto status = (STATUS); status != cudaSuccess) { \
+        RUNTIME_ERROR(fmt::format("cuda failed on \"" #STATUS "\" with \"{}\" ({})", \
+                                  cudaGetErrorString(status), (int) status)); \
+    }
+
 namespace refactor::hardware {
 
     class Nvidia final : public Device {
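With the macro now in a public header, any file that includes `hardware/devices/nvidia.h` can wrap CUDA runtime calls with it. A minimal usage sketch (the `allocZeroed` helper is hypothetical, not part of this PR):

```cpp
#include "hardware/devices/nvidia.h"
#include <cuda_runtime.h>

// Hypothetical helper: each cudaError_t-returning call is wrapped in
// CUDA_ASSERT. On failure the macro throws via RUNTIME_ERROR, and since
// it stringizes its argument (#STATUS), the message quotes the exact
// failing call, e.g. `cuda failed on "cudaMalloc(&ptr, bytes)" ...`.
void *allocZeroed(size_t bytes) {
    void *ptr = nullptr;
    CUDA_ASSERT(cudaMalloc(&ptr, bytes));
    CUDA_ASSERT(cudaMemset(ptr, 0, bytes));
    return ptr;
}
```

One caveat of the `if`-without-`else` formulation: an `else` written directly after `CUDA_ASSERT(...)` in caller code would silently bind to the macro's `if`, so the macro is safest used as a standalone statement, as above.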
src/02hardware/src/devices/nvidia/device.cc (6 changes: 0 additions & 6 deletions)
@@ -4,12 +4,6 @@
 #ifdef USE_CUDA
 #include "memory.hh"
 #include <cuda_runtime.h>
-
-#define CUDA_ASSERT(STATUS) \
-    if (auto status = (STATUS); status != cudaSuccess) { \
-        RUNTIME_ERROR(fmt::format("cuda failed on \"" #STATUS "\" with \"{}\" ({})", \
-                                  cudaGetErrorString(status), (int) status)); \
-    }
 #endif
 
 namespace refactor::hardware {
src/02hardware/src/devices/nvidia/memory.cc (8 changes: 1 addition & 7 deletions)
@@ -1,15 +1,9 @@
 #ifdef USE_CUDA
 
 #include "memory.hh"
 #include "common.h"
+#include "hardware/devices/nvidia.h"
 #include <cuda_runtime.h>
 
-#define CUDA_ASSERT(STATUS) \
-    if (auto status = (STATUS); status != cudaSuccess) { \
-        RUNTIME_ERROR(fmt::format("cuda failed on \"" #STATUS "\" with \"{}\" ({})", \
-                                  cudaGetErrorString(status), (int) status)); \
-    }
-
 namespace refactor::hardware {
     using M = NvidiaMemory;
 
src/04kernel/CMakeLists.txt (3 changes: 2 additions & 1 deletion)
@@ -26,7 +26,8 @@ if(USE_CUDA)
     # nvrtc for cuda kernel compile
     # cublas for matmul
     # cudnn for conv and others
-    target_link_libraries(kernel PUBLIC cuda nvrtc cublas cublasLt cudnn kernel_cuda)
+    target_link_libraries(kernel PUBLIC cuda kernel_cuda)
+    target_link_libraries(kernel PRIVATE nvrtc cublas cublasLt cudnn)
     target_include_directories(kernel PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
     list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
     find_package(NCCL REQUIRED)
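The split link line changes what consumers inherit: `cuda` and `kernel_cuda` stay `PUBLIC`, so targets that link `kernel` still get them transitively, while `nvrtc`/`cublas`/`cublasLt`/`cudnn` become an implementation detail of `kernel` and no longer propagate into dependents' link interfaces.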
src/04kernel/cuda/CMakeLists.txt (2 changes: 1 addition & 1 deletion)
@@ -4,7 +4,7 @@ project(kernel_cuda)
 file(GLOB_RECURSE KERNEL_CUDA_SUB_SRC src/*.cu)
 
 add_library(kernel_cuda STATIC ${KERNEL_CUDA_SUB_SRC})
-target_link_libraries(kernel_cuda PUBLIC common)
+target_link_libraries(kernel_cuda PUBLIC common CCCL::CCCL)
 target_include_directories(kernel_cuda PUBLIC include)
 
 file(GLOB_RECURSE KERNEL_CUDA_TEST test/*.cu)
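`CCCL::CCCL` is the umbrella target exported by the CCCL package added via `CPMAddPackage` in the top-level CMakeLists; linking it `PUBLIC` puts the bundled Thrust/CUB/libcu++ headers on the include path of `kernel_cuda` and of everything that links against it.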
src/04kernel/include/kernel/attributes/attention_info.h (20 changes: 20 additions & 0 deletions, new file)
@@ -0,0 +1,20 @@
#ifndef KERNEL_ATTENTION_INFO_H
#define KERNEL_ATTENTION_INFO_H

#include "../tensor.h"

namespace refactor::kernel {

    struct AttentionInfo {
        DataType dataType;
        dim_t batch, nHead, nKVHead, seqLen, headDim, cacheLen;
        bool concatCache, resetCache;

        dim_t attLen(dim_t pastSeqLen) const noexcept;
        size_t attSize(dim_t pastSeqLen) const noexcept;
        size_t maxAttSize() const noexcept;
    };

}// namespace refactor::kernel

#endif// KERNEL_ATTENTION_INFO_H
src/04kernel/include/kernel/collectors/attention.h (3 changes: 1 addition & 2 deletions)
@@ -6,9 +6,8 @@
 namespace refactor::kernel {
 
     struct AttentionCollector final : public InfoCollector {
-        dim_t maxSeqLen;
 
-        AttentionCollector(decltype(_target), decltype(maxSeqLen)) noexcept;
+        AttentionCollector(decltype(_target)) noexcept;
 
         std::vector<KernelBox>
         filter(TensorRefs inputs, TensorRefs outputs) const final;
src/04kernel/src/attributes/attention_info.cc (17 changes: 17 additions & 0 deletions, new file)
@@ -0,0 +1,17 @@
#include "kernel/attributes/attention_info.h"

namespace refactor::kernel {

    dim_t AttentionInfo::attLen(dim_t pastSeqLen) const noexcept {
        return pastSeqLen + seqLen;
    }

    size_t AttentionInfo::attSize(dim_t pastSeqLen) const noexcept {
        return batch * nHead * seqLen * attLen(pastSeqLen) * dataType.size();
    }

    size_t AttentionInfo::maxAttSize() const noexcept {
        return batch * nHead * seqLen * (cacheLen ? cacheLen : seqLen) * dataType.size();
    }

}// namespace refactor::kernel
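A worked example to sanity-check the three formulas, with hypothetical sizes (not taken from the PR):

```cpp
// Assume: batch = 1, nHead = 8, seqLen = 4, cacheLen = 16,
//         dataType.size() == 4 (f32), pastSeqLen = 12.
//
// attLen(12)   = 12 + 4             = 16           // past + current tokens
// attSize(12)  = 1 * 8 * 4 * 16 * 4 = 2048 bytes   // one attention matrix
// maxAttSize() = 1 * 8 * 4 * 16 * 4 = 2048 bytes   // cacheLen caps attLen
//
// With no cache (cacheLen == 0) the worst case is the square seqLen x seqLen
// matrix: maxAttSize() = 1 * 8 * 4 * 4 * 4 = 512 bytes.
```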
src/04kernel/src/collectors/attention.cc (55 changes: 37 additions & 18 deletions)
@@ -1,38 +1,57 @@
 #include "kernel/collectors/attention.h"
+#include "kernel/attributes/attention_info.h"
 // #include "../kernels/attention/cpu_kernel.hh"
 #include "../kernels/attention/cuda_kernel.hh"
 
 namespace refactor::kernel {
 
     AttentionCollector::AttentionCollector(
-        decltype(_target) target,
-        decltype(maxSeqLen) maxSeqLen_) noexcept
-        : InfoCollector(target),
-          maxSeqLen(maxSeqLen_) {}
+        decltype(_target) target) noexcept
+        : InfoCollector(target) {}
 
     std::vector<KernelBox>
     AttentionCollector::filter(TensorRefs inputs, TensorRefs outputs) const {
         auto const &query = inputs[0].get();
         auto const &key = inputs[1].get();
-        auto pastSeqLen = inputs.size() == 3 ? 0 : *inputs[2].get().data->get<int64_t>();
-        auto cacheLen = outputs.size() == 1 ? 0 : outputs[1].get().shape[2];
 
-        std::vector<KernelBox> ans;
+        AttentionInfo info{
+            .dataType = query.dataType,
+            .batch = query.shape[0],
+            .nHead = query.shape[1],
+            .nKVHead = key.shape[1],
+            .seqLen = query.shape[2],
+            .headDim = query.shape[3],
+            .cacheLen = 0,
+            .concatCache = false,
+            .resetCache = false,
+        };
+        switch (outputs.size()) {
+            case 1:
+                // no kv cache
+                ASSERT(inputs.size() == 3, "");
+                break;
+            case 3:
+                switch (inputs.size()) {
+                    case 6:
+                        info.resetCache = true;
+                    case 4:
+                        info.concatCache = true;
+                    case 3:
+                        info.cacheLen = outputs[1].get().shape[2];
+                        break;
+                    default:
+                        UNREACHABLE();
+                }
+                break;
+            default:
+                UNREACHABLE();
+        }
 
+        std::vector<KernelBox> ans;
         switch (_target) {
             case decltype(_target)::Cpu:
                 break;
             case decltype(_target)::Nvidia: {
-                decltype(AttentionCuda::info) info{
-                    .dataType = query.dataType,
-                    .batch = query.shape[0],
-                    .nHead = query.shape[1],
-                    .nKVHead = key.shape[1],
-                    .pastSeqLen = static_cast<dim_t>(pastSeqLen),
-                    .seqLen = query.shape[2],
-                    .cacheLen = cacheLen,
-                    .headDim = query.shape[3],
-                    .resetCache = false,
-                };
                 if (auto ptr = AttentionCuda::build(info); ptr) {
                     ans.emplace_back(std::move(ptr));
                 }
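The nested `switch` above decodes operand arities into cache modes, relying on deliberate fall-through (`case 6` falls into `case 4`, which falls into `case 3`):

| `outputs.size()` | `inputs.size()` | `concatCache` | `resetCache` | `cacheLen` |
| --- | --- | --- | --- | --- |
| 1 | 3 | false | false | 0 |
| 3 | 3 | false | false | `outputs[1].shape[2]` |
| 3 | 4 | true | false | `outputs[1].shape[2]` |
| 3 | 6 | true | true | `outputs[1].shape[2]` |

Any other combination hits `UNREACHABLE()`. Note also that `info` is now built once, before the target dispatch, so a future CPU kernel can reuse it, and that `AttentionInfo` no longer stores `pastSeqLen`: the old collector read it from `inputs[2]` at collection time, whereas the new interface takes it as a runtime argument to `attLen`/`attSize`.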