diff --git a/CMakeLists.txt b/CMakeLists.txt index 1136d0953..16e0f9ae5 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,6 +59,8 @@ endif() find_package(Doxygen QUIET) +################################################################################ + # # CUTLASS 3.x requires C++17 # @@ -80,6 +82,10 @@ endif() message(STATUS "Default Install Location: ${CMAKE_INSTALL_PREFIX}") +set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.") +# 0 - Sanity, 1 - Release-Quality, 2 - Exhaustive + +################################################################################ set(CUTLASS_ENABLE_HEADERS_ONLY OFF CACHE BOOL "Enable only the header library") if(CUTLASS_ENABLE_HEADERS_ONLY) @@ -112,6 +118,8 @@ if (CUTLASS_ENABLE_TESTS) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake) endif() +################################################################################ + set(CUTLASS_NVCC_ARCHS_SUPPORTED "") if (CUDA_VERSION VERSION_GREATER_EQUAL 11.4 AND NOT CUDA_COMPILER MATCHES "[Cc]lang") list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 70 72 75 80 86 87) @@ -197,15 +205,16 @@ set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.") set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.") set(CUTLASS_ENABLE_F16C OFF CACHE BOOL "Enable F16C x86 extensions in host code.") +################################################################################ # # CUTLASS generator cmake configuration # + set(CUTLASS_LIBRARY_OPERATIONS "all" CACHE STRING "Comma delimited list of operation name filters. Default '' means all operations are enabled.") -set(CUTLASS_LIBRARY_KERNELS "" CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.") +set(CUTLASS_LIBRARY_KERNELS ${CUTLASS_LIBRARY_KERNELS_INIT} CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.") set(CUTLASS_LIBRARY_IGNORE_KERNELS "" CACHE STRING "Comma delimited list of kernel names to exclude from build.") -# Test Levels L0, L1, L2 -set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.") +################################################################################ set(CUTLASS_TEST_ENABLE_CACHED_RESULTS ON CACHE BOOL "Enable caching and reuse of test results in unit tests") @@ -225,6 +234,8 @@ if (CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED) list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED=1) endif() +################################################################################ + # # CUDA 10.1 introduces "mma" in PTX performing collective matrix multiply operations. # @@ -650,14 +661,16 @@ function(cutlass_add_executable_tests NAME TARGET) # DEPENDS: A list of targets or files on which this test is dependent. # DEPENDEES: A list of targets which should depend on this test. # TEST_COMMAND_OPTIONS: A list of variables (i.e. by reference params) which contain command line arguments -# to pass to the test executable. A unique test with suffix _0, _1, ... is generated for each set of +# to pass to the test executable. A unique test is generated for each set of # options given. If this option is not used, a single test with no arguments is generated. +# TEST_COMMAND_OPTIONS_PREFIX: If provided, is added as a prefix to each TEST_COMMAND_OPTIONS value for +# generating the full variable name to be referenced. # RESULT_CACHE_FILE: A file to be installed alongside the test executable with pre-computed # test results to speed up test runtime. # set(options DISABLE_EXECUTABLE_INSTALL_RULE) - set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE) + set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE TEST_COMMAND_OPTIONS_PREFIX) set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS) cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -701,7 +714,6 @@ function(cutlass_add_executable_tests NAME TARGET) endif() list(LENGTH __TEST_COMMAND_OPTIONS CMD_COUNT) - set(CMD_IDX 0) if (CMD_COUNT GREATER 1) add_custom_target(${NAME} DEPENDS ${TARGET} ${__DEPENDS}) @@ -710,12 +722,12 @@ function(cutlass_add_executable_tests NAME TARGET) endforeach() endif() - foreach(CMD_OPTIONS ${__TEST_COMMAND_OPTIONS}) + foreach(CMD_OPTIONS_VAR IN LISTS __TEST_COMMAND_OPTIONS) if (CMD_COUNT GREATER 1) - set(TEST_NAME ${NAME}_${CMD_IDX}) + string(TOLOWER "${NAME}_${CMD_OPTIONS_VAR}" TEST_NAME) else() - set(TEST_NAME ${NAME}) + string(TOLOWER "${NAME}" TEST_NAME) endif() # The following rigmarole is needed to deal with spaces and possible quotes in @@ -724,14 +736,14 @@ function(cutlass_add_executable_tests NAME TARGET) # preserves any quotes. Note, they have to be in this order for it to work for # all the use cases below. - set(CMD_OPTIONS ${${CMD_OPTIONS}}) - list(JOIN CMD_OPTIONS " " TEST_COMMAND_OPTIONS) - separate_arguments(CMD_OPTIONS) - + set(TEST_COMMAND_OPTIONS ${${__TEST_COMMAND_OPTIONS_PREFIX}${CMD_OPTIONS_VAR}}) + list(JOIN TEST_COMMAND_OPTIONS " " TEST_COMMAND_OPTIONS) + separate_arguments(TEST_COMMAND_OPTIONS) + add_custom_target( ${TEST_NAME} COMMAND - ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ ${CMD_OPTIONS} + ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ ${TEST_COMMAND_OPTIONS} DEPENDS ${TARGET} ) @@ -746,7 +758,7 @@ function(cutlass_add_executable_tests NAME TARGET) add_test( NAME c${TEST_NAME} - COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ ${CMD_OPTIONS} + COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ ${TEST_COMMAND_OPTIONS} ) set_tests_properties(c${TEST_NAME} PROPERTIES DISABLED ${__DISABLE_TESTS}) @@ -756,18 +768,21 @@ function(cutlass_add_executable_tests NAME TARGET) # To run the tests from an install package with tests enabled, we need to generate test files # that don't rely on the current directory structure in build. + set(TEST_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/${NAME}) + file(MAKE_DIRECTORY ${TEST_GEN_DIR}) + set(TEST_NAME c${TEST_NAME}) set(TEST_EXE $) set(TEST_EXE_WORKING_DIRECTORY ./${CMAKE_INSTALL_BINDIR}) - configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" @ONLY) + configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" @ONLY) file(GENERATE - OUTPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake" - INPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" + OUTPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake" + INPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" ) install( - FILES "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake" + FILES "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake" DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest/ ) @@ -775,8 +790,6 @@ function(cutlass_add_executable_tests NAME TARGET) endif() - math(EXPR CMD_IDX "${CMD_IDX} + 1") - endforeach() endfunction() @@ -787,6 +800,7 @@ if (CUTLASS_ENABLE_TOOLS) add_dependencies(test_all test_profiler) endif() endif() + if (CUTLASS_ENABLE_EXAMPLES) add_subdirectory(examples) add_dependencies(test_all test_examples) diff --git a/examples/08_turing_tensorop_gemm/CMakeLists.txt b/examples/08_turing_tensorop_gemm/CMakeLists.txt index a240bcc97..e9d659e19 100644 --- a/examples/08_turing_tensorop_gemm/CMakeLists.txt +++ b/examples/08_turing_tensorop_gemm/CMakeLists.txt @@ -31,5 +31,6 @@ cutlass_example_add_executable( 08_turing_tensorop_gemm turing_tensorop_gemm.cu + DISABLE_TESTS ON ) diff --git a/examples/12_gemm_bias_relu/CMakeLists.txt b/examples/12_gemm_bias_relu/CMakeLists.txt index 5d4dac6cf..abe61be1c 100644 --- a/examples/12_gemm_bias_relu/CMakeLists.txt +++ b/examples/12_gemm_bias_relu/CMakeLists.txt @@ -31,5 +31,6 @@ cutlass_example_add_executable( 12_gemm_bias_relu gemm_bias_relu.cu + DISABLE_TESTS ON ) diff --git a/examples/31_basic_syrk/basic_syrk.cu b/examples/31_basic_syrk/basic_syrk.cu index 82f4a6a2f..65ac04808 100644 --- a/examples/31_basic_syrk/basic_syrk.cu +++ b/examples/31_basic_syrk/basic_syrk.cu @@ -34,7 +34,7 @@ matrix multiply kernel to verify its correctness. The CUTLASS Syrk template is instantiated in the function CutlassSsyrkNN. This is kernel computes - the symmetric rank-k update (SYRK) using double-precision doubleing-point arithmetic and assumes + the symmetric rank-k update (SYRK) using double-precision floating-point arithmetic and assumes all matrices have column-major layout. The threadblock tile size is chosen as 16x32x16 which offers good performance for large matrices. diff --git a/examples/32_basic_trmm/basic_trmm.cu b/examples/32_basic_trmm/basic_trmm.cu index 74f5cb9f0..084b6563c 100644 --- a/examples/32_basic_trmm/basic_trmm.cu +++ b/examples/32_basic_trmm/basic_trmm.cu @@ -34,7 +34,7 @@ matrix multiply kernel to verify its correctness. The CUTLASS Trmm template is instantiated in the function CutlassStrmmNN. This is kernel computes - the triangular matrix product (TRMM) using double-precision doubleing-point arithmetic and assumes + the triangular matrix product (TRMM) using double-precision floating-point arithmetic and assumes all matrices have column-major layout. The threadblock tile size is chosen as 64x64x16 which offers good performance for large matrices. diff --git a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu index 12739a057..bb995f598 100644 --- a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu +++ b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu @@ -495,7 +495,7 @@ int main(int argc, const char **argv) options.tensor_d.resize(options.problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from CUTLASS kernel options.tensor_ref_d.resize(options.problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from reference kernel - // Fill matrix A on host with uniform-random data [4, -4] + // Fill matrix A on host with uniform-random data [2, -2] cutlass::reference::host::TensorFillRandomUniform( options.tensor_a.host_view(), 1, @@ -503,7 +503,7 @@ int main(int argc, const char **argv) ElementA(-2), 0); - // Fill matrix B on host with uniform-random data [4, -4] + // Fill matrix B on host with uniform-random data [2, -2] cutlass::reference::host::TensorFillRandomUniform( options.tensor_b.host_view(), 1, @@ -511,7 +511,7 @@ int main(int argc, const char **argv) ElementB(-2), 0); - // Fill matrix C on host with uniform-random data [4, -4] + // Fill matrix C on host with uniform-random data [2, -2] cutlass::reference::host::TensorFillRandomUniform( options.tensor_c.host_view(), 1, diff --git a/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu b/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu index 001e8329d..6bbdfb6a9 100644 --- a/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu +++ b/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu @@ -84,9 +84,10 @@ therefore letting the builder pick the collective specialization. CUTLASS builders make an attempt to pick the best schedule when `Auto` is provided such that the - assembled collctives have the best performance, but this is not a guarantee. A user relying on `Auto` + assembled collectives have the best performance, but this is not a guarantee. A user relying on `Auto` may get a free performance upgrade with newer CUTLASS releases in case we can provide more optimized - implementations that the builder can transparently assemble for `Auto`. + implementations that the builder can transparently assemble for `Auto`. But a user should not rely on + `Auto` if they require a specific scheduling policy and/or stage count to be used. If a user decides to let the builders pick the collective specialization via `Auto` schedules, they must be used for both mainloop and epilogue alike to ensure compatibility between the @@ -99,11 +100,6 @@ in this manner remains the primary API for using CUTLASS 3 kernels. `CollectiveBuilder`s are simply meant to be a convenience interface. - Note also that, while the selections made by CollectiveBuilder attempt to maximize performance, this is not - a guarantee. Furthermore, the behavior of the CollectiveBuilder when `Auto` parameters are provided is subject - to change in future CUTLASS releases -- do not rely on `Auto` if you require a specific scheduling policy and/or - stage count to be used. - Details of this example ----------------------- This example walks through the use of the CollectiveBuilder with various schedules and stage counts specified. diff --git a/include/cute/arch/cluster_sm90.hpp b/include/cute/arch/cluster_sm90.hpp index e9c858d54..7e909712a 100644 --- a/include/cute/arch/cluster_sm90.hpp +++ b/include/cute/arch/cluster_sm90.hpp @@ -227,4 +227,17 @@ elect_one_leader_sync() #endif } +// Store value to remote shared memory in the cluster +CUTE_DEVICE +void +store_shared_remote(uint32_t value, uint32_t smem_addr, uint32_t mbarrier_addr, uint32_t dst_cta_rank) +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + uint32_t dsmem_addr = set_block_rank(smem_addr, dst_cta_rank); + uint32_t remote_barrier_addr = set_block_rank(mbarrier_addr, dst_cta_rank); + asm volatile("st.async.shared::cluster.mbarrier::complete_tx::bytes.u32 [%0], %1, [%2];" + : : "r"(dsmem_addr), "r"(value), "r"(remote_barrier_addr)); +#endif +} + } // end namespace cute diff --git a/include/cutlass/arch/memory.h b/include/cutlass/arch/memory.h index db6c4073e..b44407e53 100644 --- a/include/cutlass/arch/memory.h +++ b/include/cutlass/arch/memory.h @@ -35,6 +35,7 @@ #pragma once #include "cutlass/cutlass.h" +#include "cutlass/arch/cache_operation.h" namespace cutlass { namespace arch { @@ -45,7 +46,9 @@ template < /// Fragment type to store loaded data typename AccessType, /// The bytes of loading - int LoadBytes + int LoadBytes, + /// Cache operation + CacheOperation::Kind cache_op = CacheOperation::Always > struct global_load; @@ -59,7 +62,7 @@ struct global_load; #if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ (__CUDACC_VER_MAJOR__ > 11)) && \ - defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) + defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) #define CUTLASS_ENABLE_L2_PREFETCH 1 #else #define CUTLASS_ENABLE_L2_PREFETCH 0 @@ -71,7 +74,8 @@ struct global_load; // keep the initializing code before ld.global template struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -107,7 +111,40 @@ struct global_load struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint4 *data = reinterpret_cast(&D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %9, 0;\n" + " mov.b32 %0, %10;\n" + " mov.b32 %1, %11;\n" + " mov.b32 %2, %12;\n" + " mov.b32 %3, %13;\n" + " mov.b32 %4, %14;\n" + " mov.b32 %5, %15;\n" + " mov.b32 %6, %16;\n" + " mov.b32 %7, %17;\n" + " @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%8];\n" + " @p ld.global.lu.v4.u32 {%4, %5, %6, %7}, [%18];\n" + "}\n" + : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w), + "=r"(data[1].x), "=r"(data[1].y), "=r"(data[1].z), "=r"(data[1].w) + : "l"(ptr), "r"((int)pred_guard), "r"(data[0].x), "r"(data[0].y), + "r"(data[0].z), "r"(data[0].w), "r"(data[1].x), "r"(data[1].y), + "r"(data[1].z), "r"(data[1].w), "l"(((uint8_t *)ptr) + 16)); + } +}; + +template +struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -133,7 +170,31 @@ struct global_load struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint4 &data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " mov.b32 %0, %6;\n" + " mov.b32 %1, %7;\n" + " mov.b32 %2, %8;\n" + " mov.b32 %3, %9;\n" + " @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%4];\n" + "}\n" + : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w) + : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w)); + } +}; + +template +struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -158,7 +219,30 @@ struct global_load struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint2 &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %3, 0;\n" + " mov.b32 %0, %4;\n" + " mov.b32 %1, %5;\n" + " @p ld.global.lu.v2.u32 {%0, %1}, [%2];\n" + "}\n" + : "=r"(data.x), "=r"(data.y) + : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y)); + } +}; + +template +struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -182,7 +266,29 @@ struct global_load struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + unsigned &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " mov.b32 %0, %3;\n" + " @p ld.global.lu.u32 %0, [%1];\n" + "}\n" + : "=r"(data) + : "l"(ptr), "r"((int)pred_guard), "r"(data)); + } +}; + +template +struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -206,7 +312,29 @@ struct global_load struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint16_t &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " mov.b16 %0, %3;\n" + " @p ld.global.lu.u16 %0, [%1];\n" + "}\n" + : "=h"(data) + : "l"(ptr), "r"((int)pred_guard), "h"(data)); + } +}; + +template +struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { diff --git a/include/cutlass/barrier.h b/include/cutlass/barrier.h index 787771478..b74e10388 100644 --- a/include/cutlass/barrier.h +++ b/include/cutlass/barrier.h @@ -81,7 +81,6 @@ struct Barrier CUTLASS_DEVICE static void red_release(int *ptr, int val) { -#if !defined(CUTLASS_PYTHON_HOST_CC) #if (__CUDA_ARCH__ >= 700) /// SM70 and newer use memory consistency qualifiers @@ -94,7 +93,6 @@ struct Barrier __threadfence(); atomicAdd(ptr, val); #endif // (__CUDA_ARCH__ >= 700) -#endif } @@ -104,7 +102,6 @@ struct Barrier CUTLASS_DEVICE static void wait_lt(void *lock_ptr, int thread_idx, int flag_idx, int count) { -#if !defined(CUTLASS_PYTHON_HOST_CC) T *flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; if (thread_idx == 0) @@ -115,14 +112,12 @@ struct Barrier } __syncthreads(); -#endif } /// Uses thread[0] to wait for at least the specified count of signals on the given flag counter CUTLASS_DEVICE static void wait_eq(void *lock_ptr, int thread_idx, int flag_idx, T val = 1) { -#if !defined(CUTLASS_PYTHON_HOST_CC) T *flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; if (thread_idx == 0) @@ -132,13 +127,11 @@ struct Barrier while(ld_acquire(flag_ptr) != val) {} } __syncthreads(); -#endif } /// Uses thread[0] to wait for the specified count of signals on the given flag counter CUTLASS_DEVICE static void wait_eq_reset(void *lock_ptr, int thread_idx, int flag_idx, T val = 1) { -#if !defined(CUTLASS_PYTHON_HOST_CC) T *flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; if (thread_idx == 0) @@ -149,14 +142,12 @@ struct Barrier } __syncthreads(); -#endif } /// Increment the arrival count for a flag CUTLASS_DEVICE static void arrive_inc(void *lock_ptr, int thread_idx, int flag_idx) { -#if !defined(CUTLASS_PYTHON_HOST_CC) T* flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; __syncthreads(); @@ -165,7 +156,6 @@ struct Barrier { red_release(flag_ptr, 1); } -#endif } @@ -173,7 +163,6 @@ struct Barrier CUTLASS_DEVICE static void arrive_range_inc(void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1) { -#if !defined(CUTLASS_PYTHON_HOST_CC) int flag_idx = first_flag_idx + thread_idx; T* flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; @@ -184,7 +173,6 @@ struct Barrier if (thread_idx < count) { red_release(flag_ptr, 1); } -#endif } }; diff --git a/include/cutlass/core_io.h b/include/cutlass/core_io.h index c0438a173..c0a968507 100644 --- a/include/cutlass/core_io.h +++ b/include/cutlass/core_io.h @@ -59,9 +59,7 @@ inline std::ostream &operator<<(std::ostream &out, dim3 d) { /// Output operator for CUDA built-in error type inline std::ostream &operator<<(std::ostream &out, cudaError_t error) { -#if !defined(CUTLASS_PYTHON_HOST_CC) return out << cudaGetErrorString(error); -#endif } /////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/collective/builders/sm90_builder.inl b/include/cutlass/epilogue/collective/builders/sm90_builder.inl index f0d51bb44..b0549d461 100644 --- a/include/cutlass/epilogue/collective/builders/sm90_builder.inl +++ b/include/cutlass/epilogue/collective/builders/sm90_builder.inl @@ -290,7 +290,7 @@ struct CollectiveBuilder< AlignmentD, Schedule, cute::enable_if_t || - cute::is_same_v >> { + cute::is_same_v >> { public: // Passing void C disables source load using ElementC = cute::conditional_t, @@ -302,16 +302,33 @@ public: using ThreadOp = thread::LinearCombination< ElementD, AlignmentD, ElementAccumulator, ElementCompute, - thread::ScaleType::Default, FloatRoundStyle::round_to_nearest, ElementC>; + ScaleType, FloatRoundStyle::round_to_nearest, ElementC>; -private: - using Impl = detail::TmaBuilderImpl< - TileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, ElementCompute, - ElementC, GmemLayoutTagC, AlignmentC, ElementD, GmemLayoutTagD, AlignmentD, - Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecialized<1,2,true>>; + using GmemStrideTypeC = gemm::TagToStrideC_t; + using GmemStrideTypeD = gemm::TagToStrideC_t; -public: - using CollectiveOp = typename Impl::CollectiveOp; + using EpilogueTile_MN = decltype(detail::sm90_compute_tile_shape_or_override< + ElementD, EpilogueTileType, Schedule>()); + + static constexpr int StagesC = 1; + static constexpr int StagesD = 2; + static constexpr bool DisableReuseSmemC = true; + using CollectiveOp = cutlass::epilogue::collective::CollectiveEpilogue< + cutlass::epilogue::Sm90TmaWarpSpecialized, + TileShape_MNK, + EpilogueTile_MN, + ElementC_, // need to pass void to expose via GemmUniversal + GmemStrideTypeC, + ElementD, + GmemStrideTypeD, + ThreadOp, + SM90_TMA_LOAD, + decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), + decltype(detail::sm90_get_smem_load_op_for_source()), + SM90_TMA_STORE, + decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), + decltype(detail::sm90_get_smem_store_op_for_accumulator()) + >; }; // Auto builder @@ -409,7 +426,7 @@ struct CollectiveBuilder< AlignmentD, Schedule, cute::enable_if_t || - cute::is_base_of_v >> { + cute::is_base_of_v >> { public: using ThreadOp = thread::LinearCombinationGeneric< @@ -419,10 +436,13 @@ public: Schedule::Round>; private: + static constexpr int StagesC = 1; + static constexpr int StagesD = 2; + static constexpr bool DisableReuseSmemC = true; using Impl = detail::TmaBuilderImpl< TileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, ElementCompute, ElementC, GmemLayoutTagC, AlignmentC, ElementD, GmemLayoutTagD, AlignmentD, - Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecialized<1,2,true>>; + Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecialized>; public: using CollectiveOp = typename Impl::CollectiveOp; @@ -459,7 +479,7 @@ struct CollectiveBuilder< AlignmentD, Schedule, cute::enable_if_t || - cute::is_base_of_v >> { + cute::is_base_of_v >> { public: using ThreadOp = thread::LinearCombinationBiasElementwise< @@ -468,10 +488,12 @@ public: Schedule::StoreT, typename Schedule::ElementBias>; private: + static constexpr int StagesC = 1; + static constexpr int StagesD = 2; using Impl = detail::TmaBuilderImpl< TileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, ElementCompute, ElementC, GmemLayoutTagC, AlignmentC, ElementD, GmemLayoutTagD, AlignmentD, - Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecializedBiasElementwise<1,2>>; + Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecializedBiasElementwise>; public: using CollectiveOp = typename Impl::CollectiveOp; diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp index 5654597e0..bbeb181e1 100644 --- a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp @@ -82,7 +82,6 @@ class CollectiveEpilogue< // // Type Aliases // - // derived types of output thread level operator using DispatchPolicy = Sm90TmaWarpSpecialized; using BlockTileShape = BlockTileShape_; using EpilogueTile = EpilogueTile_; @@ -108,7 +107,6 @@ class CollectiveEpilogue< constexpr static bool iskThreadEpilogueOpWithBias = detail::IsThreadEpilogueOpWithBias::value; using AlignmentType = typename uint_bit::value * kOutputAlignment>::type; - static_assert(sizeof(ElementC) == 2, "Only 16b source supported for now"); static_assert(sizeof(ElementD) == 2, "Only 16b output supported for now"); static_assert(!is_layout::value && is_tuple::value, "EpilogueTile must be a cute::Tile or cute::Shape"); static_assert(rank(BlockTileShape{}) == 3, "BlockTileShape must be rank-3: [BLK_M,BLK_N,BLK_K]"); @@ -117,17 +115,19 @@ class CollectiveEpilogue< static_assert(rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); private: + using InternalElementC = std::conditional_t,ElementD,ElementC>; // prevents void ref breakages constexpr static int StagesC = StagesC_; constexpr static int StagesD = StagesD_; constexpr static bool is_source_supported = ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default || ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::NoBetaScaling; + static_assert((std::is_void_v && not is_source_supported) || (not std::is_void_v && is_source_supported)); // internal optimization to reuse C shared memory for storing D - using SmemLayoutAtomBitsC = decltype(downcast::value>(SmemLayoutAtomC{})); + using SmemLayoutAtomBitsC = decltype(downcast::value>(SmemLayoutAtomC{})); using SmemLayoutAtomBitsD = decltype(downcast::value>(SmemLayoutAtomD{})); constexpr static bool ReuseSmemC = not DispatchPolicy::DisableSmemReuseC && is_source_supported && - sizeof(ElementC) == sizeof(ElementD) && + sizeof(InternalElementC) == sizeof(ElementD) && StrideC{} == StrideD{} && cute::is_same_v; @@ -152,7 +152,7 @@ class CollectiveEpilogue< using LoadPipeline = cutlass::PipelineTransactionAsync; using LoadPipelineState = cutlass::PipelineState; constexpr static uint32_t TmaTransactionBytes = - size(take<0,2>(SmemLayoutC{})) * static_cast(sizeof(ElementC)); + size(take<0,2>(SmemLayoutC{})) * static_cast(sizeof(InternalElementC)); // TMA pipeline for storing D using StorePipeline = cutlass::PipelineTmaStore; @@ -161,8 +161,8 @@ class CollectiveEpilogue< struct SharedStorage { struct TensorStorage : aligned_struct<128> { cute::conditional_t, - array_aligned> smem_C; + detail::EmptyStorage, + array_aligned> smem_C; alignas(128) cute::conditional_t, array_aligned> smem_D; @@ -187,7 +187,7 @@ class CollectiveEpilogue< struct Params { using TMA_C = decltype(make_tma_copy( CopyOpG2S{}, - make_tensor(static_cast(nullptr), + make_tensor(static_cast(nullptr), repeat_like(StrideC{}, int32_t(0)), StrideC{}), SmemLayoutC{}(_,_,0))); using TMA_D = decltype(make_tma_copy( @@ -217,7 +217,7 @@ class CollectiveEpilogue< auto M = get<0>(problem_shape_MNKL); auto N = get<1>(problem_shape_MNKL); auto L = get<3>(problem_shape_MNKL); - Tensor tensor_c = make_tensor(args.ptr_C, make_layout(make_shape(M,N,L), args.dC)); + Tensor tensor_c = make_tensor(static_cast(args.ptr_C), make_layout(make_shape(M,N,L), args.dC)); Tensor tensor_d = make_tensor(args.ptr_D, make_layout(make_shape(M,N,L), args.dD)); typename Params::TMA_C tma_load_c = make_tma_copy( CopyOpG2S{}, @@ -409,7 +409,7 @@ class CollectiveEpilogue< // Allocate register tensors auto tRS_rD_shape = take<0,3>(shape(thread_r2s.partition_S(bEsD))); // (R2S,R2S_M,R2S_N) - Tensor tRS_rC = make_tensor(tRS_rD_shape); // (R2S,R2S_M,R2S_N) + Tensor tRS_rC = make_tensor(tRS_rD_shape); // (R2S,R2S_M,R2S_N) Tensor tRS_rD = make_tensor(tRS_rD_shape); // (R2S,R2S_M,R2S_N) // Vectorized fragment view for thread epilogue op @@ -418,7 +418,7 @@ class CollectiveEpilogue< Tensor tRS_rD_frg = recast(tRS_rD); // Partition for smem to register copy (tSR_) - TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom{}, tiled_r2s); + TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom{}, tiled_r2s); ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx); Tensor tSR_sC = thread_s2r.partition_S(bEsC); // (S2R,S2R_M,S2R_N,EPI_M,EPI_N) Tensor tSR_rC = thread_s2r.retile_D(tRS_rC); // (S2R,S2R_M,S2R_N) diff --git a/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp b/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp index e78eb76c4..c89f28895 100644 --- a/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp +++ b/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp @@ -130,6 +130,7 @@ class LinearCombinationTensorBroadcast { using ActivationFunctor = ActivationFunctor_; static constexpr int kCount = 1; + static constexpr ScaleType::Kind kScale = Scale; using FragmentOutput = Array; using FragmentAccumulator = Array; diff --git a/include/cutlass/epilogue/threadblock/epilogue.h b/include/cutlass/epilogue/threadblock/epilogue.h index 4cba4a60c..61d961df3 100644 --- a/include/cutlass/epilogue/threadblock/epilogue.h +++ b/include/cutlass/epilogue/threadblock/epilogue.h @@ -323,7 +323,7 @@ class Epilogue : OutputTileIterator destination_iterator, ///< Tile iterator for destination OutputTileIterator source_iterator) ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) { - // Redcuce peer accumulator fragments into one fragment + // Reduce peer accumulator fragments into one fragment AccumulatorFragment accum_fragment; BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace); diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp index a80a6dbde..faf2857ac 100644 --- a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp @@ -190,7 +190,8 @@ struct CollectiveMma< "SmemLayoutB K must be 128bytes to be transposed."); static_assert(!transform::collective::detail::use_universal_transposition(), "Warp specialized ARF kernels have not supported universal B transposition yet."); - static_assert(!TransposeB || shape<0>(TileShape{}) == 64, "Optimized transpose RS kernel requires TileShape M = 64."); + static_assert(!TransposeB || !cute::is_same_v, + "Transpose RS kernel requires kernel schedule schmem is not KernelTmaWarpSpecializedCooperative."); struct SharedStorage { @@ -294,7 +295,7 @@ struct CollectiveMma< static constexpr int K_PIPE_MAX = DispatchPolicy::Stages; static constexpr int K_PIPE_MMAS = DispatchPolicy::PipelineAsyncMmaStages; - static_assert(K_PIPE_MMAS >= 1, "At least one MMA stage should be asynchronous for this mainloop."); + static_assert(K_PIPE_MMAS == 0, "no MMA stage should be asynchronous for this mainloop for now."); static constexpr uint32_t TmaTransactionBytes = (size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast(sizeof(InternalElementA)))+ (size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast(sizeof(InternalElementB))); @@ -368,21 +369,6 @@ struct CollectiveMma< } } - // Issue the prologue loads - int k_tile_prologue = min(k_tile_count, K_PIPE_MAX); - CUTLASS_PRAGMA_UNROLL - for (int count = 0; count < k_tile_prologue; ++count) { - pipeline.producer_acquire(smem_pipe_write); - using BarrierType = typename MainloopPipeline::ProducerBarrierType; - BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write); - - int write_stage = smem_pipe_write.index(); - copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); - copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); - ++k_tile_iter; - ++smem_pipe_write; - } - k_tile_count -= k_tile_prologue; // Mainloop CUTLASS_PRAGMA_NO_UNROLL for ( ; k_tile_count > 0; --k_tile_count) { diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp index 01638c528..39c1a17aa 100644 --- a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp @@ -303,22 +303,6 @@ struct CollectiveMma< } } - // Issue the prologue loads - int k_tile_prologue = min(k_tile_count, K_PIPE_MAX); - CUTLASS_PRAGMA_UNROLL - for (int count = 0; count < k_tile_prologue; ++count) { - pipeline.producer_acquire(smem_pipe_write); - using BarrierType = typename MainloopPipeline::ProducerBarrierType; - BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write); - - int write_stage = smem_pipe_write.index(); - copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); - copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); - ++k_tile_iter; - ++smem_pipe_write; - } - k_tile_count -= k_tile_prologue; - // Mainloop CUTLASS_PRAGMA_NO_UNROLL for ( ; k_tile_count > 0; --k_tile_count) diff --git a/include/cutlass/gemm/device/base_grouped.h b/include/cutlass/gemm/device/base_grouped.h index 207266c7c..f3094e90c 100644 --- a/include/cutlass/gemm/device/base_grouped.h +++ b/include/cutlass/gemm/device/base_grouped.h @@ -301,18 +301,19 @@ class BaseGrouped { return 0; } - result = cudaGetDeviceProperties(&properties, device_idx); + int multiprocessor_count; + result = cudaDeviceGetAttribute(&multiprocessor_count, + cudaDevAttrMultiProcessorCount, device_idx); if (result != cudaSuccess) { - // Call cudaGetLastError() to clear the error bit - result = cudaGetLastError(); - CUTLASS_TRACE_HOST(" cudaGetDeviceProperties() returned error " - << cudaGetErrorString(result)); + CUTLASS_TRACE_HOST( + " cudaDeviceGetAttribute() returned error " + << cudaGetErrorString(result)); return 0; } - bool override_sm_count = (available_sm_count < 0 || available_sm_count > properties.multiProcessorCount); + bool override_sm_count = (available_sm_count < 0 || available_sm_count > multiprocessor_count); if (override_sm_count) { - available_sm_count = properties.multiProcessorCount; + available_sm_count = multiprocessor_count; } int max_active_blocks = maximum_active_blocks(); @@ -440,8 +441,6 @@ class BaseGrouped { cudaError_t result = cudaGetLastError(); if (result != cudaSuccess) { - // Call cudaGetLastError() to clear the error bit - result = cudaGetLastError(); CUTLASS_TRACE_HOST(" grid launch failed with error " << cudaGetErrorString(result)); return Status::kErrorInternal; } diff --git a/include/cutlass/gemm/device/default_gemm_configuration.h b/include/cutlass/gemm/device/default_gemm_configuration.h index 8d193c5be..f4ce58514 100644 --- a/include/cutlass/gemm/device/default_gemm_configuration.h +++ b/include/cutlass/gemm/device/default_gemm_configuration.h @@ -490,9 +490,9 @@ struct DefaultGemmConfiguration; - using WarpShape = GemmShape<64, 64, 64>; - using InstructionShape = GemmShape<16, 8, 16>; + using ThreadblockShape = GemmShape<128, 128, 16>; + using WarpShape = GemmShape<32, 64, 16>; + using InstructionShape = GemmShape<8, 8, 4>; static int const kStages = 3; using EpilogueOutputOp = epilogue::thread::LinearCombination< diff --git a/include/cutlass/gemm/device/gemm_universal_with_broadcast.h b/include/cutlass/gemm/device/gemm_universal_with_broadcast.h index 54b7d61c3..e0940f146 100644 --- a/include/cutlass/gemm/device/gemm_universal_with_broadcast.h +++ b/include/cutlass/gemm/device/gemm_universal_with_broadcast.h @@ -31,7 +31,7 @@ /*! \file \brief Template for a GEMM kernel that can broadcast bias vector in the - epigloue. + epilogue. */ #pragma once diff --git a/include/cutlass/gemm/device/gemv_strided_batched.h b/include/cutlass/gemm/device/gemv_strided_batched.h new file mode 100644 index 000000000..005afda97 --- /dev/null +++ b/include/cutlass/gemm/device/gemv_strided_batched.h @@ -0,0 +1,167 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/arch.h" +#include "cutlass/device_kernel.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" +#include "cutlass/gemm/kernel/gemm_universal.h" + +#include "cutlass/gemm/kernel/default_gemm_universal.h" +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/gemm/device/gemm_universal_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class GemvStridedBatched { +public: + + using GemvKernel = GemvKernel_; + + using ElementA = typename GemvKernel::ElementA; + using LayoutA = typename GemvKernel::LayoutA; + using ElementB = typename GemvKernel::ElementB; + using ElementC = typename GemvKernel::ElementC; + + using ElementAccumulator = typename GemvKernel::ElementAccumulator; + + using EpilogueOutputOp = typename GemvKernel::EpilogueOutputOp; + + static ComplexTransform const kTransformA = GemvKernel::kTransformA; + static ComplexTransform const kTransformB = GemvKernel::kTransformB; + + static int const kThreadCount = GemvKernel::kThreadCount; + static int const mThreadCount = GemvKernel::mThreadCount; + + static int const kStages = GemvKernel::kStages; + + static int const kAlignmentA = GemvKernel::kAlignmentA; + static int const kAlignmentB = GemvKernel::kAlignmentB; + static int const kAlignmentC = GemvKernel::kAlignmentC; + + using Arguments = typename GemvKernel::Arguments; + using Params = typename GemvKernel::Params; + +private: + + Params params_; + +public: + + /// Constructs the Gemv. + GemvStridedBatched() {} + + /// Determines whether the Gemv can execute the given problem. + static Status can_implement(Arguments const& args) { + return GemvKernel::can_implement(args); + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const& args) { return 0; } + + /// Initializes Gemv state from arguments. + Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) { + params_ = Params(args); + + if (args.problem_size.column() % GemvKernel::kElementsPerAccess) { + return Status::kErrorMisalignedOperand; + } + + return Status::kSuccess; + } + + /// Lightweight update given a subset of arguments + Status update(Arguments const &args, void *workspace = nullptr) { + return params_.update(args); + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr) { + + dim3 grid(1, 1, params_.batch_count % 65536); + dim3 block(kThreadCount, mThreadCount, 1); + + int smem_size = 0; + + // Launch + cutlass::Kernel<<>>(params_); + + // + // Query for errors + // + + cudaError_t result = cudaGetLastError(); + + return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { return run(stream); } + + /// Runs the kernel using initialized state. + Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace, stream); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/dispatch_policy.hpp b/include/cutlass/gemm/dispatch_policy.hpp index 8de19d46a..aee918ee2 100644 --- a/include/cutlass/gemm/dispatch_policy.hpp +++ b/include/cutlass/gemm/dispatch_policy.hpp @@ -150,7 +150,7 @@ template< int Stages_, class ClusterShape_ = Shape<_1,_1,_1>, class KernelSchedule = KernelTmaWarpSpecialized, - int PipelineAsyncMmaStages_ = 1 + int PipelineAsyncMmaStages_ = 0 > struct MainloopSm90TmaGmmaRmemAWarpSpecialized { constexpr static int Stages = Stages_; diff --git a/include/cutlass/gemm/kernel/gemv_strided_batched.h b/include/cutlass/gemm/kernel/gemv_strided_batched.h new file mode 100644 index 000000000..ddd42c268 --- /dev/null +++ b/include/cutlass/gemm/kernel/gemv_strided_batched.h @@ -0,0 +1,368 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/complex.h" +#include "cutlass/tensor_ref.h" + +#include "cutlass/arch/memory.h" +#include "cutlass/arch/cache_operation.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/layout/matrix.h" + +#include "cutlass/numeric_conversion.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA_, /// matrix + typename LayoutA_, + typename ElementB_, /// vector + typename ElementC_, + typename ElementAccumulator_, + int kElementsPerAccess_, + typename EpilogueOutputOp_ +> +struct GemvStridedBatched { +public: + + using ElementA = ElementA_; + using LayoutA = layout::RowMajor; + using TensorRefA = TensorRef; + + static_assert(std::is_same::value, + "Only supported for row-major A matrix"); + + using ElementB = ElementB_; + using ElementC = ElementC_; + + using ElementAccumulator = ElementAccumulator_; + using EpilogueOutputOp = EpilogueOutputOp_; + + static ComplexTransform const kTransformA = ComplexTransform::kNone; + static ComplexTransform const kTransformB = ComplexTransform::kNone; + + static FloatRoundStyle const Round = cutlass::FloatRoundStyle::round_to_nearest; + + // number of return elements in a global access + static int const kElementsPerAccess = kElementsPerAccess_; + + using FragmentA = Array; + using FragmentB = Array; + using FragmentCompute = Array; + + // thread block shape (kThreadCount, mThreadCount) + static int const kThreadCount = std::min(static_cast(128 / (kElementsPerAccess * sizeof(ElementA))), 16); + static int const mThreadCount = 128 / kThreadCount; + + // rolling tile shape + static int const kTileA = kThreadCount * kElementsPerAccess; + static int const mTileA = mThreadCount * 8; + + // + // Structures + // + + /// Argument structure + struct Arguments + { + MatrixCoord problem_size; + int32_t batch_count; + typename EpilogueOutputOp::Params output_op; + + TensorRefA ref_A; + + ElementB const *ptr_B; + ElementC const *ptr_C; + ElementC *ptr_D; + + int64_t batch_stride_A; + int64_t batch_stride_B; + int64_t batch_stride_C; + int64_t batch_stride_D; + + // + // Methods + // + + Arguments() : batch_count(0) {} + + Arguments( + MatrixCoord problem_size, + int32_t batch_count, + + typename EpilogueOutputOp::Params output_op, + TensorRefA ref_A, + void const *ptr_B, + void const *ptr_C, + void *ptr_D, + + int64_t batch_stride_A, + int64_t batch_stride_B, + int64_t batch_stride_C, + int64_t batch_stride_D) : problem_size(problem_size), + batch_count(batch_count), + output_op(output_op), + ref_A(ref_A), + ptr_B(static_cast(ptr_B)), + ptr_C(static_cast(ptr_C)), + ptr_D(static_cast(ptr_D)), + + batch_stride_A(batch_stride_A), + batch_stride_B(batch_stride_B), + batch_stride_C(batch_stride_C), + batch_stride_D(batch_stride_D) + { + } + + Arguments( + MatrixCoord problem_size, + typename EpilogueOutputOp::Params output_op, + TensorRefA ref_A, + void const *ptr_B, + void const *ptr_C, + void *ptr_D) : Arguments(problem_size, + 1, + 1, + output_op, + ref_A, + ptr_B, + ptr_C, + ptr_D, + 1, + 1, + 1, + 1) + { + } + + Status update(Arguments const &args) + { + problem_size = args.problem_size; + batch_count = args.batch_count; + output_op = args.output_op; + ref_A = ref_A; + ptr_B = args.ptr_B; + ptr_C = args.ptr_C; + ptr_D = args.ptr_D; + batch_stride_A = args.batch_stride_A; + batch_stride_B = args.batch_stride_B; + batch_stride_C = args.batch_stride_C; + batch_stride_D = args.batch_stride_D; + + return Status::kSuccess; + } + }; + + using Params = Arguments; + + /// Shared memory storage structure + union SharedStorage + { + }; + +public: + // + // Methods + // + + CUTLASS_DEVICE + GemvStridedBatched() {} + + /// Determines whether kernel satisfies alignment + static Status can_implement(cutlass::MatrixCoord const &problem_size) + { + if (problem_size.column() % kElementsPerAccess != 0) + return Status::kErrorMisalignedOperand; + return Status::kSuccess; + } + + static Status can_implement(Arguments const &args) + { + return can_implement(args.problem_size); + } + + /// Executes one GEMV + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) + { + // Loop over batch indices + for (int batch_idx = blockIdx.z; batch_idx < params.batch_count; batch_idx += gridDim.z) + { + int k_col_id = threadIdx.x; + int m_row_id = threadIdx.y; + + // problem_size (row = m, column = k) + // matrix A (batch, m, k) + // vector B (batch, 1, k) + // vector C (batch, m, 1) + // vector D (batch, m, 1) + + // move in the batch dimension + ElementA const *ptr_A = params.ref_A.data() + batch_idx * params.batch_stride_A; + ElementB const *ptr_B = params.ptr_B + batch_idx * params.batch_stride_B; + + ElementC const *ptr_C = params.ptr_C + batch_idx * params.batch_stride_C; + ElementC *ptr_D = params.ptr_D + batch_idx * params.batch_stride_D; + + // move in the k dimension + ptr_A += k_col_id * kElementsPerAccess; + ptr_B += k_col_id * kElementsPerAccess; + + // move in the m dimension + ptr_A += m_row_id * params.problem_size.column(); + ptr_C += m_row_id; + ptr_D += m_row_id; + + NumericArrayConverter srcA_converter; + NumericArrayConverter srcB_converter; + + for (; m_row_id < params.problem_size.row(); m_row_id += mTileA) + { + ElementAccumulator accum[mTileA / mThreadCount] = {0.f}; + + FragmentB fragB; + FragmentA fragA[mTileA / mThreadCount]; + + int mElemCountPerTile = min(mTileA / mThreadCount, (params.problem_size.row() - m_row_id - 1) / mThreadCount + 1); + + int kUnroll = 0; + + for (; kUnroll < params.problem_size.column() / kTileA * kTileA; kUnroll += kTileA) + { + for (int m = 0; m < mElemCountPerTile; m++) + { + // fetch from matrix A + arch::global_load(fragA[m], (ptr_A + kUnroll + m * mThreadCount * params.problem_size.column()), true); + } + + // fetch from vector B + arch::global_load(fragB, (ptr_B + kUnroll), true); + + for (int m = 0; m < mElemCountPerTile; m++) + { + FragmentCompute fragB_Compute = srcB_converter(fragB); + FragmentCompute fragA_Compute = srcA_converter(fragA[m]); + + // Math + CUTLASS_PRAGMA_UNROLL + for (int e = 0; e < kElementsPerAccess; e++) + { + accum[m] += fragA_Compute.at(e) * fragB_Compute.at(e); + } + } + } + + // calculate the rest of K elements + // each thread fetch 1 element each time + for (int k = kUnroll + k_col_id; k < params.problem_size.column(); k += kThreadCount) + { + ElementB b = *(ptr_B - k_col_id * kElementsPerAccess + k); + for (int m = 0; m < mElemCountPerTile; m++) + { + ElementA a = *(ptr_A - k_col_id * kElementsPerAccess + k + m * mThreadCount * params.problem_size.column()); + accum[m] += ElementAccumulator(a) * ElementAccumulator(b); + } + } + + EpilogueOutputOp output_op(params.output_op); + typename EpilogueOutputOp::FragmentOutput source_fragment[mTileA / mThreadCount]; + + // prefetch from source matrix C + if (output_op.is_source_needed()) + { + for (int m = 0; m < mElemCountPerTile; m++) + { + source_fragment[m][0] = *(ptr_C + m * mThreadCount); + } + } + + typename EpilogueOutputOp::FragmentAccumulator accum_fragment; + typename EpilogueOutputOp::FragmentOutput output_fragment; + + for (int m = 0; m < mElemCountPerTile; m++) + { + for (int mask = (kThreadCount >> 1); mask > 0; mask >>= 1) + { + accum[m] += __shfl_xor_sync(0xFFFFFFFF, accum[m], mask, 32); + } + + if (k_col_id == 0) + { + accum_fragment[0] = accum[m]; + + if (output_op.is_source_needed()) + { + output_fragment = output_op(accum_fragment, source_fragment[m]); + } + else + { + output_fragment = output_op(accum_fragment); + } + + *(ptr_D + m * mThreadCount) = output_fragment[0]; + } + } + + ptr_A += mTileA * params.problem_size.column(); + ptr_C += mTileA; + ptr_D += mTileA; + } + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp index 768c780c0..69f4bb7b4 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp @@ -166,8 +166,8 @@ class GemmUniversal< CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n"); return implementable; } - static constexpr int tma_alignment_bits = 128; - static constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits::value; + constexpr int tma_alignment_bits = 128; + constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits::value; auto M = get<0>(args.problem_shape); auto N = get<1>(args.problem_shape); auto K = get<2>(args.problem_shape); @@ -182,7 +182,17 @@ class GemmUniversal< N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0)); if (!implementable) { CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + return implementable; + } + + constexpr bool is_beta_supported = + CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default; + implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr); + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n"); + return implementable; } + return implementable; } diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp index d6619febc..38db8e0f1 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp @@ -173,8 +173,8 @@ class GemmUniversal< CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n"); return implementable; } - static constexpr int tma_alignment_bits = 128; - static constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits::value; + constexpr int tma_alignment_bits = 128; + constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits::value; auto M = get<0>(args.problem_shape); auto N = get<1>(args.problem_shape); auto K = get<2>(args.problem_shape); @@ -189,7 +189,17 @@ class GemmUniversal< N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0)); if (!implementable) { CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + return implementable; + } + + constexpr bool is_beta_supported = + CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default; + implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr); + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n"); + return implementable; } + return implementable; } diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp index 3b8e61e05..fe22b0dfb 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp @@ -196,8 +196,8 @@ class GemmUniversal< CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n"); return implementable; } - static constexpr int tma_alignment_bits = 128; - static constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits::value; + constexpr int tma_alignment_bits = 128; + constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits::value; auto M = get<0>(args.problem_shape); auto N = get<1>(args.problem_shape); auto K = get<2>(args.problem_shape); @@ -212,7 +212,17 @@ class GemmUniversal< N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0)); if (!implementable) { CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + return implementable; + } + + constexpr bool is_beta_supported = + CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default; + implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr); + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n"); + return implementable; } + return implementable; } diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp index af619b963..82cdf918d 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp @@ -204,8 +204,8 @@ class GemmUniversal< CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n"); return implementable; } - static constexpr int tma_alignment_bits = 128; - static constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits::value; + constexpr int tma_alignment_bits = 128; + constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits::value; auto M = get<0>(args.problem_shape); auto N = get<1>(args.problem_shape); auto K = get<2>(args.problem_shape); @@ -220,7 +220,17 @@ class GemmUniversal< N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0)); if (!implementable) { CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + return implementable; + } + + constexpr bool is_beta_supported = + CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default; + implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr); + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n"); + return implementable; } + return implementable; } diff --git a/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp b/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp index c1c47020b..910a3f1dd 100644 --- a/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp +++ b/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp @@ -163,6 +163,12 @@ class PersistentTileSchedulerSm90 { int const min_num_gpc = sm_count < max_sm_per_gpc ? 1 : sm_count / max_sm_per_gpc; int const max_blk_occupancy_per_gpc = max_sm_per_gpc - (max_sm_per_gpc % size(cluster_shape)); int blk_per_device = min_num_gpc * max_blk_occupancy_per_gpc; + + // The calculation below allows for larger grid size launch for different GPUs. + int const num_gpc_residual = sm_count < max_sm_per_gpc ? 0 : sm_count % max_sm_per_gpc; + int const max_blk_occupancy_per_residual_gpc = num_gpc_residual - (num_gpc_residual % size(cluster_shape)); + blk_per_device += max_blk_occupancy_per_residual_gpc; + blk_per_device = sm_count < blk_per_device ? sm_count : blk_per_device; launch_grid.x = std::min( diff --git a/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h b/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h index a1d522bca..f0468f89f 100644 --- a/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h +++ b/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h @@ -630,9 +630,6 @@ struct ThreadblockSwizzleStreamK { } -// Guards needed for PyCUTLASS library generation -#if !defined(CUTLASS_PYTHON_HOST_CC) - // // Device-side interface // @@ -692,7 +689,7 @@ struct ThreadblockSwizzleStreamK { return GemmCoord(m, n, get_batch_idx()); } - /// Obtains the calling threadblock's tiled coordinates for the given tile index (row-major rastorization) + /// Obtains the calling threadblock's tiled coordinates for the given tile index (row-major rasterization) CUTLASS_DEVICE GemmCoord get_tile_offset_row_major(int tile_idx) const { @@ -740,7 +737,7 @@ struct ThreadblockSwizzleStreamK { div_mod_sk_iters_per_region(region_idx, iter_in_region, iter); int big_block_iters = (sk_big_blocks_per_region * sk_iters_per_normal_block()) + sk_big_blocks_per_region; // number of iterations in the region's big blocks - int normal_block_iters = iter_in_region - big_block_iters; // number of iterations in the region's normal bocks + int normal_block_iters = iter_in_region - big_block_iters; // number of iterations in the region's normal blocks int big_block_idx_in_region = div_mod_sk_iters_per_big_block.div(iter_in_region); int normal_block_idx_in_region = sk_big_blocks_per_region + div_mod_sk_iters_per_normal_block.div(normal_block_iters); @@ -794,8 +791,6 @@ struct ThreadblockSwizzleStreamK { return get_sk_block_idx(iter); } -#endif // !defined(CUTLASS_PYTHON_HOST_CC) - }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/pipeline/sm90_pipeline.hpp b/include/cutlass/pipeline/sm90_pipeline.hpp index d90a7f14c..807a13992 100644 --- a/include/cutlass/pipeline/sm90_pipeline.hpp +++ b/include/cutlass/pipeline/sm90_pipeline.hpp @@ -450,7 +450,7 @@ private : CUTLASS_DEVICE void consumer_wait(uint32_t stage, uint32_t phase, ConsumerToken barrier_token) { if (barrier_token == BarrierStatus::WaitAgain) { - consumer_wait(stage, phase); + full_barrier_ptr_[stage].wait(phase); } } @@ -654,7 +654,7 @@ public : consumer_release(state.index()); } -protected: +private: FullBarrier *full_barrier_ptr_ = nullptr; EmptyBarrier *empty_barrier_ptr_ = nullptr; Params params_; @@ -976,6 +976,11 @@ private : ++stage_; } + CUTLASS_DEVICE + void advance() { + ++stage_; + } + private: CUTLASS_DEVICE diff --git a/include/cutlass/semaphore.h b/include/cutlass/semaphore.h index 27343f9e6..afb11bd28 100644 --- a/include/cutlass/semaphore.h +++ b/include/cutlass/semaphore.h @@ -89,19 +89,16 @@ class Semaphore { /// Waits until the semaphore is equal to the given value CUTLASS_DEVICE void wait(int status = 0) { -#if !defined(CUTLASS_PYTHON_HOST_CC) while( __syncthreads_and(state != status) ) { fetch(); } __syncthreads(); -#endif } /// Updates the lock with the given result CUTLASS_DEVICE void release(int status = 0) { -#if !defined(CUTLASS_PYTHON_HOST_CC) __syncthreads(); if (wait_thread) { @@ -111,7 +108,6 @@ class Semaphore { asm volatile ("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status)); #endif } -#endif } }; diff --git a/media/docs/cute/01_layout.md b/media/docs/cute/01_layout.md index 7f7c6f20d..c1a25ac1b 100644 --- a/media/docs/cute/01_layout.md +++ b/media/docs/cute/01_layout.md @@ -3,8 +3,8 @@ ## Layout This document describes `Layout`, CuTe's core abstraction. -A `Layout` maps from (a) logical coordinate space(s) -to a physical index space. +A `Layout` maps from a logical coordinate space +to an index space. `Layout`s present a common interface to multidimensional array access that abstracts away the details of how the array's elements are organized in memory. @@ -19,7 +19,11 @@ This can help users do things like partition layouts of data over layouts of thr ## Layouts and Tensors -Any of the `Layout`s discussed in this section can be composed with data -- a pointer or an array -- to create a `Tensor`. The responsibility of the `Layout` is to define valid coordinate space(s) and, therefore, the logical shape of the data and map those into an index space. The index space is precisely the offset that would be used to index into the array of data. +Any of the `Layout`s discussed in this section can be composed with data -- e.g., a pointer or an array -- to create a `Tensor`. +The `Layout`'s logical coordinate space represents the logical "shape" of the data, +e.g., the modes of the `Tensor` and their extents. +The `Layout` maps a logical coordinate into an index, +which is an offset to be used to index into the array of data. For details on `Tensor`, please refer to the [`Tensor` section of the tutorial](./03_tensor.md). @@ -31,31 +35,31 @@ Both `Shape` and `Stride` are `IntTuple` types. ### IntTuple -An `IntTuple` is an integer or a tuple of `IntTuple`s. +An `IntTuple` is defined recursively as either a single integer, or a tuple of `IntTuple`s. This means that `IntTuple`s can be arbitrarily nested. Operations defined on `IntTuple`s include the following. -* `get(IntTuple)`: The `I`th element of the `IntTuple`. Note that `get<0>` is defined for integer `IntTuples`. +* `get(IntTuple)`: The `I`th element of the `IntTuple`. For an `IntTuple` consisting of a single integer, `get<0>` is just that integer. -* `rank(IntTuple)`: The number of elements in an `IntTuple`. An int has rank 1, a tuple has rank `tuple_size`. +* `rank(IntTuple)`: The number of elements in an `IntTuple`. A single integer has rank 1, and a tuple has rank `tuple_size`. -* `depth(IntTuple)`: The number of hierarchical `IntTuple`s. An int has depth 0, a tuple has depth 1, a tuple that contains a tuple has depth 2, etc. +* `depth(IntTuple)`: The number of hierarchical `IntTuple`s. A single integer has depth 0, a tuple of integers has depth 1, a tuple that contains a tuple of integers has depth 2, etc. -* `size(IntTuple)`: The product of all elements of the IntTuple. +* `size(IntTuple)`: The product of all elements of the `IntTuple`. -We write `IntTuple`s with parenthesis to denote the hierarchy. E.g. `6`, `(2)`, `(4,3)`, `(3,(6,2),8)` are all `IntTuple`s. +We write `IntTuple`s with parenthesis to denote the hierarchy. For example, `6`, `(2)`, `(4,3)`, `(3,(6,2),8)` are all `IntTuple`s. ## Layout -A `Layout` is then a pair of `IntTuple`s. The first defines the abstract *shape* of the layout and the second defines the *strides*, which map from coordinates within the shape to the index space. +A `Layout` is then a pair of `IntTuple`s. The first element defines the abstract *shape* of the `Layout`, and the second element defines the *strides*, which map from coordinates within the shape to the index space. -As a pair of `IntTuple`s, we can define many similar operations on `Layout`s including +Since a `Layout` is just a pair of `IntTuple`s, we can define operations on `Layout`s analogous to those defined on `IntTuple`. * `get(Layout)`: The `I`th sub-layout of the `Layout`. * `rank(Layout)`: The number of modes in a `Layout`. -* `depth(Layout)`: The number of hierarchical `Layout`s. An int has depth 0, a tuple has depth 1, a tuple that contains a tuple has depth 2, etc. +* `depth(Layout)`: The number of hierarchical `Layout`s. A single integer has depth 0, a tuple of integers has depth 1, a tuple that contains a tuple of integers has depth 2, etc. * `shape(Layout)`: The shape of the `Layout`. @@ -86,7 +90,7 @@ These hierarchical access functions include the following. ### Vector examples -Then, we can define a vector as any `Shape` and `Stride` pair with `rank == 1`. +We define a vector as any `Shape` and `Stride` pair with `rank == 1`. For example, the `Layout` ``` @@ -95,9 +99,9 @@ Stride: (1) ``` defines a contiguous 8-element vector. -Similarly, with a stride of `(2)`, +For a vector with the same Shape but a Stride of `(2)`, the interpretation is that the eight elements -are stored at positions 0, 2, 4, $\dots$. +are stored at positions 0, 2, 4, $\dots$, 14. By the above definition, we *also* interpret @@ -168,9 +172,17 @@ auto layout_2x4 = make_layout(make_shape (2, make_shape (2,2)), make_stride(4, make_stride(2,1))); ``` +The `make_layout` function returns a `Layout`. +It deduces the returned `Layout`'s template arguments from the function's arguments. +Similarly, the `make_shape` and `make_stride` functions +return a `Shape` resp. `Stride`. +CuTe often uses these `make_*` functions, +because constructor template argument deduction (CTAD) +does not work for `cute::tuple` as it works for `std::tuple`. + ## Using a `Layout` -The fundamental use of a `Layout` is to map between logical coordinate space(s) and index space. For example, to print an arbitrary rank-2 layout, we can write the function +The fundamental use of a `Layout` is to map between logical coordinate space(s) and an index space. For example, to print an arbitrary rank-2 layout, we can write the function ```c++ template diff --git a/media/docs/cute/02_layout_operations.md b/media/docs/cute/02_layout_operations.md index d0c7f2ab4..7860cb7db 100644 --- a/media/docs/cute/02_layout_operations.md +++ b/media/docs/cute/02_layout_operations.md @@ -73,6 +73,7 @@ In C++, we identify a Tuple with the `cute::tuple` behaves like `std::tuple`, but it works on device or host, and it imposes restrictions on its template arguments for performance and simplicity. + #### IntTuple CuTe then defines an IntTuple as either an integer, or a Tuple of IntTuple. @@ -136,7 +137,7 @@ This code produces the following text output. ``` `print(layout(1, 1))` prints the mapping of -the logical 2-D coordinate (1,1) to 1-D index, which is 4. +the logical 2-D coordinate (1,1) to the 1-D index, which is 4. You can see that from the table, which shows the left logical index as the "row," and the right logical index as the "column." @@ -302,13 +303,13 @@ Both humans and CuTe compute composition using the following rules. 2. Concatenation: A layout can be expressed as the concatenation of its sublayouts. We denote concatenation with parentheses: $B = (B_0,B_1,...)$. The CuTe function `make_layout`, when given zero or more `Layout`s, concatenates them. -3. Composition is (left-)distributive with concatenation: $A \circ B = A \circ (B0, B1, ...) = (A \circ B0, A \circ B1, ...)$. +3. Composition is (left-)distributive with concatenation: $A \circ B = A \circ (B_0, B_1, ...) = (A \circ B_0, A \circ B_1, ...)$. 4. "Base case": For layouts $A = a : b$ and $B = c : d$ with integral shape and stride, $A \circ B = R = c : (b * d)$. 5. By-mode composition: Let $\langle B, C \rangle$ (angle brackets, not parentheses) - denote a tuple of two layouts B and C, not their concatenation. Let A = (A0, A1). - Then, $A \circ \langle B, C \rangle = (A0, A1) \circ \langle B, C \rangle = (A0 \circ B, A1 \circ C)$. + denote a tuple of two layouts B and C, not their concatenation. Let $A = (A_0, A_1)$. + Then, $A \circ \langle B, C \rangle = (A_0, A_1) \circ \langle B, C \rangle = (A_0 \circ B, A_1 \circ C)$. This allows the application of composition independently to sublayouts of $A$. #### Examples: Reshape a vector into a matrix @@ -359,6 +360,55 @@ The resulting layout has shape $(4,5)$, just as before. What are the strides? 5. Result: (4:10, 5:2), which by concatenation is (4,5) : (10,2). +#### Example: Reshape a matrix into another matrix + +The composition $((20,2):(16,4) \circ (4,5):(1,4))$ +expresses reshaping the matrix with layout (20,2):(16:4), +into a 4 x 5 matrix in a column-major way. + +1. By deconcatenation, $(4,5) : (1,4)$ is $(4:1, 5:4)$. + +2. Composition is distributive, so $(20,2):(16,4) \circ (4:1, 5:4)$ is $((20,2):(16,4) \circ 4:1, (20,2):(16,4) \circ 5:4)$. + +3. $(20,2):(16,4) \circ 4:1$ has shape $4$ and stride $16$. (4:1 expresses picking the first 4 consecutive elements of (20,2):(16,4). These elements run down the 0th column (leftmost mode) of the layout, whose stride is 16.) + +4. $(20,2):(16,4) \circ 5:4$ has shape $5$ and stride $64 = 4 \cdot 16$. + +5. Result: $(4:16, 5:64)$, which by concatenation is $(4,5) : (16,64)$. + +We get exactly this result with CuTe +if we use compile-time shapes and strides. +The following C++ code prints `(_4,_5):(_16,_64).` + +```c++ +using namespace cute; +auto a = make_layout(make_shape(Int<20>{}, _2{}), make_stride(_16{}, _4{})); +auto b = make_layout(make_shape( _4{}, _5{}), make_stride( _1{}, _4{})); +auto c = composition(a, b); +printf("\n"); +print(c); +``` + +Results may _look_ different (but are the same mathematically) +if we use run-time integers. +The following C++ code prints `((4,1),(5,1)):((16,4),(64,4)).` + +```c++ +using namespace cute; +auto a = make_layout(make_shape(20, 2), make_stride(16, 4)); +auto b = make_layout(make_shape( 4, 5), make_stride( 1, 4)); +auto c = composition(a, b); +printf("\n"); +print(c); +``` + +((4,1),(5,1)):((16,4),(64,4)) is effectively the same layout +as (4,5) : (16,64), because the 1s in the shape don't affect the layout +(as a mathematical function from one integer to one integer). +CuTe chooses not to simplify layout computations +with run-time values in them as much as it could, +because simplifications involving run-time values have a run-time cost. + ### Product CuTe includes four different kinds of layout products. @@ -428,7 +478,7 @@ results in Shape ((2, 2), (3, 4)) and Stride ((1, 2), (16, 4)). | (1,1) | 3 | 19 | 35 | 7 | 23 | 39 | 11 | 27 | 43 | 15 | 31 | 47 | Note how the tile appears in the leftmost column and is reproduced -in each column in the same order as the matrix-of-tiles. That is, +in each column in the same order as the matrix-of-tiles. That is, the tile can be indexed through the first mode of the result and the matrix-of-tiles can be indexed through the second mode. @@ -456,8 +506,8 @@ Shape ((3, 2), (4, 2)) and Stride ((16, 1), (4, 2)). | (2,1) | 33 | 37 | 41 | 45 | 35 | 39 | 43 | 47 | The tile is now interleaved or "raked" with the other 3x4 matrix-of-tiles -instead of appearing as blocks. Other references call this cyclic -distribution. +instead of appearing as blocks. Other references call this a "cyclic +distribution." This might look familiar if you have ever used ScaLAPACK. It expresses a 2-D block cyclic distribution of a 6 x 8 matrix @@ -542,7 +592,87 @@ CuTe includes 3 different kinds of layout division operations. We will summarize these in the sections that follow. -#### Logical divide : the intuitive tiling +#### Logical divide + +##### Example worked in detail + +This section will work the following logical divide example in detail. + +```c++ +Layout a = make_layout(24, 2); +Layout b = make_layout( 4, 2); +Layout c = logical_divide(a, b); +``` + +Logical divide produces a rank-2 `Layout`, +where mode 0 (the leftmost mode) corresponds to the divisor `b`, +and mode 1 (the rightmost mode) corresponds to the "remainder." +Intuitively, the remainder of 24 divided by 4 is 6, +so we know that mode 1 has 6 elements. +We just don't know its shape yet. + +CuTe defines `logical_divide(a, b)` as +`composition(a, make_layout(b, complement(b, size(a))))`. +Here, `size(a)` is 24. +What is `complement(b, 24)`? +Intuitively, it means "the remainder," +what's left over after applying `b` to 0, 1, 2, $\dots$, 23. + +The layout 4:2 means "take 4 elements at even-numbered indices." +The following table overlays the range of 4:2 +atop the complement's codomain 0, 1, $\dots$, 23. + +| Range of 4:2 | 0 | | 2 | | 4 | | 6 | | | | | | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Codomain | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | $\dots$ | 23 | + +Layouts are linear, so their range must include zero. +The complement of 4:2 with respect to 24 is thus a layout whose range + +* includes zero; + +* does not include any other elements of the range of 4:2 + (i.e., satisfies the disjoint property; see above); and + +* includes as much of 0, 1, $\dots$, 23 as possible + (so that it forms the "remainder" of 4:2 with respect to 24). + +Intuitively, the range of the complement must look like this: +0, 1, 8, 9, 16, 17. +The resulting layout is ordered. +It has size 6 and cosize 18, +so it satisfies the bounded property (see above). +This is the layout (2, 3) : (1, 8). +(Going from this intuitive sense of the complement +to knowing how to compute it directly +is out of scope for this part of the tutorial.) + +The following table shows 4:2 with its complement (2, 3) : (1, 8). + +| Range of 4:2 | 0 | | 2 | | 4 | | 6 | | | | | | | | | | | | | | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Codomain | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | $\dots$ | 23 | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Range of complement | 0 | 1 | | | | | | | 8 | 9 | | | | | | | 16 | 17 | | | + +Now we know that `logical_divide`(24:2, 4:2) is +`composition`(24:2, `make_layout`(4:2, (2,3):(1,8))). +The composition of two layouts has the shape of the second (rightmost) layout, +so the resulting shape is (4, (2, 3)). +We see that the leftmost mode 4 corresponds to the divisor 4:2, +and the rightmost mode (2, 3) describes what's "left over" +from the original shape 24. + +What are the strides? +We can start from the leftmost mode. +4:2 takes every other element (the even-numbered elements) of 24:2. +That's a stride-2 thing, striding over a stride-2 thing. +The resulting stride is 4. +Similarly, the stride 2 of 24:2 +doubles the two strides of the rightmost mode. +The resulting layout is (4, (2, 3)) : (4, (2, 16)). + +##### Tiling example Suppose I have the 6 x 8 matrix from the Raked Product section and want to "collect" the `tile`, turning the Raked Product into @@ -607,7 +737,7 @@ Note that this is the same layout as the result in the Logical Product section. That is, the first mode is our original tile (and can be interpreted as a 2x2 matrix itself) and the second mode is its logical layout within the raked layout. -##### More Examples of Divide +#### More Examples of Divide For brevity, shapes can be used with `logical_divide` and `tiled_divide` to quickly split and tile modes of a tensor. For example, this C++ code diff --git a/python/cutlass/backend/compiler.py b/python/cutlass/backend/compiler.py index c5b1caea3..3b9ae1a6e 100644 --- a/python/cutlass/backend/compiler.py +++ b/python/cutlass/backend/compiler.py @@ -200,7 +200,7 @@ def load_operation(self, op_key, extra_funcs): self.compiled_cache_host.insert(key, compiled_host_fns) return True - def emit_compile_(self, operation_list, compilation_options, requires_nvcc_hostlib_compilation): + def emit_compile_(self, operation_list, compilation_options): """ Compile a list of kernels and store them into database """ @@ -306,41 +306,17 @@ def emit_compile_(self, operation_list, compilation_options, requires_nvcc_hostl cubin_image = file.read() # Set up the host-side library code - if requires_nvcc_hostlib_compilation: - cmd_template = ( - "echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}" - % source_buffer_host - ) - cmd = SubstituteTemplate( - cmd_template, - { - "cuda_install_path": CUDA_INSTALL_PATH, - "options": compilation_options.get_str(), - }, - ) - else: - options = compilation_options.get() - cmd = ( - "echo '%s'|g++ -x c++ -fpermissive -w -fPIC -DCUTLASS_PYTHON_HOST_CC=1" - % source_buffer_host - ) - filtered_opts = [ - "-default-device", - "-Xcicc", - "-Xllc", - "--expt-relaxed-constexpr", - "-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored", - ] - for opt in options: - opt = opt.decode("utf-8") - if opt not in filtered_opts and "-arch=sm_" not in opt: - if "--include-path=" in opt: - cmd += " " + opt.replace( - "--include-path=", - "-I", - ) - else: - cmd += " " + opt + cmd_template = ( + "echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}" + % source_buffer_host + ) + cmd = SubstituteTemplate( + cmd_template, + { + "cuda_install_path": CUDA_INSTALL_PATH, + "options": compilation_options.get_str(), + }, + ) tempfile.tempdir = "./" temp = tempfile.NamedTemporaryFile( @@ -375,7 +351,6 @@ def add_module(self, operations, compile_options=None): # save the cubin operation_key = [] operation_list = [] - requires_nvcc_hostlib_compilation = False for operation in operations: # step 1: get kernel string as key key = operation.rt_module.emit() + operation.procedural_name() + self.backend @@ -398,17 +373,9 @@ def add_module(self, operations, compile_options=None): operation_list.append(operation.rt_module) operation_key.append(key) - # Creating the Params structures for certain 3.0 kernels currently requires CUDA. For these cases, use NVCC to generate - # the PyCUTLASS host-side library. Otherwise, g++ will be used. - if isinstance(operation, GemmOperationUniversal) and operation.api == ApiVersion.v3x: - if self.backend == "nvrtc": - raise RuntimeError("CUTLASS 3 kernels currently require NVCC for compilation.") - - requires_nvcc_hostlib_compilation = True - if len(operation_list) > 0: cubin_image, host_lib, host_file = self.emit_compile_( - operation_list, compile_options, requires_nvcc_hostlib_compilation) + operation_list, compile_options) err, module = cuda.cuModuleLoadData(cubin_image) if err != cuda.CUresult.CUDA_SUCCESS: diff --git a/python/cutlass/cpp/include/tensor_ref_view.h b/python/cutlass/cpp/include/tensor_ref_view.h index f99d63391..48872e61e 100644 --- a/python/cutlass/cpp/include/tensor_ref_view.h +++ b/python/cutlass/cpp/include/tensor_ref_view.h @@ -43,10 +43,10 @@ template void bind_tensor_ref_view(py::module &m, std::string name) { py::class_>(m, ("TensorRef" + name).c_str()) - .def("__init__", [](cutlass::TensorRef& tensor_ref, int64_t address, const L& layout_ ) { + .def(py::init([](int64_t address, const L& layout_ ) { T* ptr = reinterpret_cast< T*>(address); - new (&tensor_ref) cutlass::TensorRef(ptr, layout_); - }) + return new cutlass::TensorRef(ptr, layout_); + })) .def("data", [](cutlass::TensorRef& tensor_ref) { T* ptr = tensor_ref.data(); return int64_t(ptr); diff --git a/python/setup.py b/python/setup.py index 4c97819ae..a87c82f44 100644 --- a/python/setup.py +++ b/python/setup.py @@ -29,9 +29,12 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################################# - +import copy import os +from pybind11.setup_helpers import Pybind11Extension +import setuptools from setuptools import setup +from setuptools.command.build_ext import build_ext def _cutlass_path_from_dir() -> str: @@ -61,31 +64,57 @@ def _cuda_install_path_from_nvcc() -> str: else _cutlass_path_from_dir() ) + cuda_install_path = ( os.getenv('CUDA_INSTALL_PATH') if os.getenv('CUDA_INSTALL_PATH') is not None else _cuda_install_path_from_nvcc() ) -ext_modules = [] -try: - from pybind11.setup_helpers import Pybind11Extension, build_ext - include_dirs = [ - cutlass_path + '/include', - cuda_install_path + '/include', - cutlass_path + '/tools/util/include', - cutlass_path + '/test', - ] +class BuildExtension(build_ext): + """ + Wrapper around `build_ext` to use NVCC when compiling the CUTLASS Python-C++ bindings. + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def build_extensions(self): + original_compile = self.compiler._compile + + def custom_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): + """ + Wrapper around build_ext.compiler._compile method + """ + postargs = copy.deepcopy(extra_postargs) + postargs = [f for f in postargs if f not in ['-g0', '-fvisibility=hidden']] + postargs.extend(["-Xcompiler='-fPIC'", "-Xcompiler='-g0'", "-Xcompiler='-O3'", '-x', 'cu']) + try: + original_compiler = self.compiler.compiler_so + self.compiler.set_executable('compiler_so', [f'{cuda_install_path}/bin/nvcc']) + original_compile(obj, src, ext, cc_args, postargs, pp_opts) + finally: + self.compiler.set_executable('compiler_so', original_compiler) + + self.compiler._compile = custom_compile + super().build_extensions() + + +include_dirs = [ + cutlass_path + '/include', + cuda_install_path + '/include', + cutlass_path + '/tools/util/include', + cutlass_path + '/test', +] + - ext_modules = [ - Pybind11Extension('cutlass_bindings', - ['cutlass/cpp/cutlass_bindings.cpp'], - include_dirs=include_dirs, - extra_compile_args=['-fpermissive', '-w', '-std=c++17', '-DCUTLASS_PYTHON_HOST_CC=1']) - ] -except ImportError: - pass +ext_modules = [ + Pybind11Extension('cutlass_bindings', + ['cutlass/cpp/cutlass_bindings.cpp'], + include_dirs=include_dirs, + extra_compile_args=['-Xcompiler="-fpermissive"', '-w', '-std=c++17'], + libraries=['cudart']) +] setup( @@ -103,4 +132,7 @@ def _cuda_install_path_from_nvcc() -> str: 'treelib' ], ext_modules=ext_modules, + cmdclass={ + 'build_ext': BuildExtension + } ) diff --git a/test/unit/gemm/device/CMakeLists.txt b/test/unit/gemm/device/CMakeLists.txt index 717dbd5bd..ee13ee497 100644 --- a/test/unit/gemm/device/CMakeLists.txt +++ b/test/unit/gemm/device/CMakeLists.txt @@ -41,6 +41,7 @@ add_custom_target( cutlass_test_unit_gemm_device_tensorop_planar_complex cutlass_test_unit_gemm_device_sparse_tensorop_sm80 cutlass_test_unit_gemv_device + cutlass_test_unit_gemv_device_strided_batched cutlass_test_unit_gemm_device_tensorop_sm90 cutlass_test_unit_gemm_device_tensorop_cluster_multicast_sm90 ) @@ -60,6 +61,7 @@ add_custom_target( test_unit_gemm_device_tensorop_planar_complex test_unit_gemm_device_sparse_tensorop_sm80 test_unit_gemv_device + test_unit_gemv_device_strided_batched test_unit_gemm_device_tensorop_sm90 ) @@ -498,6 +500,15 @@ cutlass_test_unit_add_executable( gemv.cu ) +cutlass_test_unit_add_executable( + cutlass_test_unit_gemv_device_strided_batched + + BATCH_SOURCES ON + BATCH_SIZE 4 + + gemv_strided_batched.cu +) + if (NOT CUDA_COMPILER MATCHES "[Cc]lang") add_dependencies( diff --git a/test/unit/gemm/device/gemm_testbed_3x.hpp b/test/unit/gemm/device/gemm_testbed_3x.hpp index 456d73605..5f19032ef 100644 --- a/test/unit/gemm/device/gemm_testbed_3x.hpp +++ b/test/unit/gemm/device/gemm_testbed_3x.hpp @@ -77,7 +77,8 @@ struct TestbedImpl { using StrideA = typename Gemm::GemmKernel::StrideA; using ElementB = typename Gemm::GemmKernel::ElementB; using StrideB = typename Gemm::GemmKernel::StrideB; - using ElementC = typename Gemm::GemmKernel::ElementC; + using ElementC = std::conditional_t, + typename Gemm::GemmKernel::ElementD,typename Gemm::GemmKernel::ElementC>; using StrideC = typename Gemm::GemmKernel::StrideC; using ElementD = typename Gemm::GemmKernel::ElementD; using StrideD = typename Gemm::GemmKernel::StrideD; diff --git a/test/unit/gemm/device/gemv_strided_batched.cu b/test/unit/gemm/device/gemv_strided_batched.cu new file mode 100644 index 000000000..7f526ac04 --- /dev/null +++ b/test/unit/gemm/device/gemv_strided_batched.cu @@ -0,0 +1,490 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Tests for device-wide strided batched GEMV interface +*/ + + +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/kernel/gemv_strided_batched.h" +#include "cutlass/gemm/device/gemv_strided_batched.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/gemm_complex.h" + +#include "testbed_utils.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace test { +namespace gemm { + +template +class TestbedStridedBatchedGemv +{ +public: + + using ElementA = typename GemvStridedBatched::ElementA; + using LayoutA = typename GemvStridedBatched::LayoutA; + using ElementB = typename GemvStridedBatched::ElementB; + using ElementC = typename GemvStridedBatched::ElementC; + + using ElementAccumulator = typename GemvStridedBatched::ElementAccumulator; + using ElementCompute = typename GemvStridedBatched::EpilogueOutputOp::ElementCompute; + + using LayoutV = cutlass::layout::RowMajor; + +private: + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A; + cutlass::HostTensor tensor_B; + cutlass::HostTensor tensor_C; + cutlass::HostTensor tensor_D; + cutlass::HostTensor reference_D; + +public: + + // + // Methods + // + + TestbedStridedBatchedGemv( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2023): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {} + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + double scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + int bits_output = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } else if (bits_output == 16) { + scope_max = 5; + scope_min = -5; + } else { + scope_max = 8; + scope_min = -8; + } + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope_max, scope_min, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else { + // TODO: Implement the rest + EXPECT_TRUE(false) << "Not implemented"; + return false; + } + + return true; + } + + /// Initializes data structures + void initialize( + cutlass::MatrixCoord problem_size, + int32_t batch_count + ) { + + // + // Allocate the GEMV workspace + // + + tensor_A.resize({batch_count * problem_size.row(), problem_size.column()}); + tensor_B.resize({batch_count * problem_size.column(), 1}); + tensor_C.resize({batch_count * problem_size.row(), 1}); + tensor_D.resize({batch_count * problem_size.row(), 1}); + reference_D.resize({batch_count * problem_size.row(), 1}, false); + + EXPECT_TRUE(initialize_tensor(tensor_A.host_view(), init_A, seed + 1)); + EXPECT_TRUE(initialize_tensor(tensor_B.host_view(), init_B, seed + 2)); + EXPECT_TRUE(initialize_tensor(tensor_C.host_view(), init_C, seed + 3)); + + // It is possible to randomly initialize to all zeros, so override this with non-zeros + // in the upper left corner of each operand. + tensor_A.host_view().at({0, 0}) = typename GemvStridedBatched::ElementA(1); + tensor_B.host_view().at({0, 0}) = typename GemvStridedBatched::ElementB(1); + tensor_C.host_view().at({0, 0}) = typename GemvStridedBatched::ElementC(1); + + cutlass::reference::host::TensorCopy(reference_D.host_view(), tensor_C.host_view()); + + tensor_A.sync_device(); + tensor_B.sync_device(); + tensor_C.sync_device(); + tensor_D.sync_device(); + } + + /// Compares computed reference with device reference and outputs to a file if incorrect + bool compare_reference( + cutlass::MatrixCoord problem_size, + ElementCompute alpha, + ElementCompute beta) { + + tensor_D.sync_host(); + + EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_A.host_view()), 0); + EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_B.host_view()), 0); + EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_C.host_view()), 0); + + EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_D.host_view()), 0); + EXPECT_GT(cutlass::reference::host::TensorNorm(reference_D.host_view()), 0); + + bool passed = cutlass::reference::host::TensorEquals(reference_D.host_view(), tensor_D.host_view()); + + EXPECT_TRUE(passed) << " mismatched reference"; + + if (!passed) { + + std::ofstream file("testbed_universal_errors.txt"); + + file + << "problem: " << problem_size + << ", alpha: " << alpha << ", beta: " << beta << "\n\n"; + + file + << "A =\n" << tensor_A.host_view() + << "\nB =\n" << tensor_B.host_view() + << "\nC =\n" << tensor_C.host_view() + << "\n\nReference =\n" << reference_D.host_view() + << "\nComputed =\n" << tensor_D.host_view(); + } + + return passed; + } + + /// Verifies the result + bool verify( + cutlass::MatrixCoord problem_size, + int32_t batch_count, + int64_t batch_stride_A, + int64_t batch_stride_B, + int64_t batch_stride_C, + int64_t batch_stride_D, + ElementCompute alpha, + ElementCompute beta) { + + // + // Verify + // + + cutlass::reference::host::GemmComplex< + typename GemvStridedBatched::ElementA, typename GemvStridedBatched::LayoutA, + typename GemvStridedBatched::ElementB, LayoutV, + typename GemvStridedBatched::ElementC, LayoutV, + ElementCompute, ElementAccumulator + >( + {problem_size.row(), 1, problem_size.column()}, + alpha, + tensor_A.host_ref(), + GemvStridedBatched::kTransformA, + tensor_B.host_ref(), + GemvStridedBatched::kTransformB, + beta, + tensor_C.host_ref(), + reference_D.host_ref(), + ElementAccumulator(0), + batch_count, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D + ); + + return compare_reference(problem_size, alpha, beta); + } + + /// Runs one problem size + bool run( + cutlass::MatrixCoord problem_size, + int32_t batch_count, + int64_t batch_stride_A, + int64_t batch_stride_B, + int64_t batch_stride_C, + int64_t batch_stride_D, + ElementCompute alpha, + ElementCompute beta) { + + this->initialize(problem_size, batch_count); + + // + // Initialize the GEMV operator + // + + typename GemvStridedBatched::Arguments arguments{ + problem_size, + batch_count, + {alpha, beta}, + tensor_A.device_ref(), + tensor_B.device_data(), + tensor_C.device_data(), + tensor_D.device_data(), + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D + }; + + GemvStridedBatched gemm_op; + + cutlass::Status status = gemm_op.can_implement(arguments); + + EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status); + + size_t workspace_size = GemvStridedBatched::get_workspace_size(arguments); + + cutlass::device_memory::allocation workspace(workspace_size); + + status = gemm_op.initialize(arguments, workspace.get()); + + EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status); + + // + // Run the GEMV + // + + status = gemm_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status); + + // + // Verify + // + + bool passed = this->verify( + problem_size, + batch_count, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D, + alpha, + beta); + return passed; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +bool TestAllGemv() { + + using ElementCompute = typename GemvStridedBatched::EpilogueOutputOp::ElementCompute; + + int Batch[] = { + 1, 520, 1314 + }; + + int M[] = { + 1, 5, 16 + }; + + int K[] = { + 8, 128, 256 + }; + + double Alpha[] = { + 1, 1.25 + }; + + double Beta[] = { + 0, 1, 1.25 + }; + + for (int b : Batch) { + for (int m : M) { + for (int k : K) { + for (double alpha : Alpha) { + for (double beta : Beta) { + + TestbedStridedBatchedGemv testbed; + + if (!testbed.run( + {m, k}, + b, + m * k, + k, + m, + m, + ElementCompute(alpha), + ElementCompute(beta))) { + return false; + } + } + } + } + } + } + + return true; +} + +} // namespace gemm +} // namespace test + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_Device_StridedBatchedGemv_f16n_f16_f16_simt_f32, Simple) { + + using ElementInput = cutlass::half_t; + using ElementOutput = cutlass::half_t; + using LayoutA = cutlass::layout::RowMajor; + + using ElementAccumulator = float; + int const kElementsPerAccess = 8; + + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator>; + + using GemvStridedBatched = cutlass::gemm::device::GemvStridedBatched< + cutlass::gemm::kernel::GemvStridedBatched< + ElementInput, // Element A + LayoutA, // Layout A + ElementInput, // Element B + ElementOutput, // Element C + ElementAccumulator, // Element accumulator + kElementsPerAccess, // Element access granularity + EpilogueOp // Output operator + >>; + + EXPECT_TRUE(test::gemm::TestAllGemv()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_Device_StridedBatchedGemv_f32n_f32_f32_simt_f32, Simple) { + + using ElementInput = float; + using ElementOutput = float; + using LayoutA = cutlass::layout::RowMajor; + + using ElementAccumulator = float; + int const kElementsPerAccess = 4; + + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator>; + + using GemvStridedBatched = cutlass::gemm::device::GemvStridedBatched< + cutlass::gemm::kernel::GemvStridedBatched< + ElementInput, // Element A + LayoutA, // Layout A + ElementInput, // Element B + ElementOutput, // Element C + ElementAccumulator, // Element accumulator + kElementsPerAccess, // Element access granularity + EpilogueOp // Output operator + >>; + + EXPECT_TRUE(test::gemm::TestAllGemv());} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_Device_StridedBatchedGemv_f64n_f64_f64_simt_f64, Simple) { + + using ElementInput = double; + using ElementOutput = double; + using LayoutA = cutlass::layout::RowMajor; + + using ElementAccumulator = double; + int const kElementsPerAccess = 2; + + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator>; + + using GemvStridedBatched = cutlass::gemm::device::GemvStridedBatched< + cutlass::gemm::kernel::GemvStridedBatched< + ElementInput, // Element A + LayoutA, // Layout A + ElementInput, // Element B + ElementOutput, // Element C + ElementAccumulator, // Element accumulator + kElementsPerAccess, // Element access granularity + EpilogueOp // Output operator + >>; + + EXPECT_TRUE(test::gemm::TestAllGemv());} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/library/scripts/gemm_operation.py b/tools/library/scripts/gemm_operation.py index cb1075bb4..0be042f9b 100644 --- a/tools/library/scripts/gemm_operation.py +++ b/tools/library/scripts/gemm_operation.py @@ -182,9 +182,9 @@ def procedural_name(self): ar = self.arch, op = opcode_class_name, ex = self.extended_name_3x(), - tbm = self.tile_description.threadblock_shape[0], - tbn = self.tile_description.threadblock_shape[1], - tbk = self.tile_description.threadblock_shape[2], + tbm = self.tile_description.tile_shape[0], + tbn = self.tile_description.tile_shape[1], + tbk = self.tile_description.tile_shape[2], cm = self.tile_description.cluster_shape[0], cn = self.tile_description.cluster_shape[1], ck = self.tile_description.cluster_shape[2], @@ -640,7 +640,7 @@ def __init__(self, operation_suffix = ''): using ${operation_name}_epilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ${arch}, ${opcode_class}, - cute::Shape, + cute::Shape, cute::Shape, cutlass::epilogue::collective::EpilogueTileAuto, ${element_accumulator}, ${element_epilogue}, @@ -655,7 +655,7 @@ def __init__(self, operation_suffix = ''): ${element_a}, ${layout_a}, ${align_a}, ${element_b}, ${layout_b}, ${align_b}, ${element_accumulator}, - cute::Shape, + cute::Shape, cute::Shape, cutlass::gemm::collective::StageCountAutoCarveout< sizeof(typename ${operation_name}_epilogue::SharedStorage)>, @@ -686,14 +686,14 @@ def instance_template(self): # def emit(self, operation): - threadblock_shape = operation.tile_description.threadblock_shape + tile_shape = operation.tile_description.tile_shape warp_count = operation.tile_description.warp_count # stage count set to zero indicates builder automatic stage selection if operation.tile_description.stages > 0: stage_count_string = f"cutlass::gemm::collective::StageCount<{str(operation.tile_description.stages)}>" else: stage_count_string = "cutlass::gemm::collective::StageCountAuto" - warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)] + warp_shape = [tile_shape[idx] // warp_count[idx] for idx in range(3)] instance_layout_A, instance_layout_B, instance_layout_C , instance_layout_D = \ (operation.A.layout, operation.B.layout, operation.C.layout, operation.D.layout) @@ -727,9 +727,9 @@ def emit(self, operation): 'element_accumulator': DataTypeTag[operation.accumulator_type()], 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], 'arch': "cutlass::arch::Sm%d" % operation.arch, - 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), - 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), - 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), + 'tile_shape_m': str(operation.tile_description.tile_shape[0]), + 'tile_shape_n': str(operation.tile_description.tile_shape[1]), + 'tile_shape_k': str(operation.tile_description.tile_shape[2]), 'cluster_m': str(operation.tile_description.cluster_shape[0]), 'cluster_n': str(operation.tile_description.cluster_shape[1]), 'cluster_k': str(operation.tile_description.cluster_shape[2]), diff --git a/tools/library/scripts/generator.py b/tools/library/scripts/generator.py index ec15a1af0..95595af4e 100644 --- a/tools/library/scripts/generator.py +++ b/tools/library/scripts/generator.py @@ -91,22 +91,21 @@ def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \ # Generates 3.0 API based GemmUniversal API kernels. Alignment constraints are folded in with layouts def CreateGemmUniversal3xOperator( - manifest, layouts, tile_descriptions, data_type, + manifest, layouts, tile_descriptions, data_types, schedules = [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto]], complex_transforms=None, epilogue_functor=EpilogueFunctor.LinearCombination, swizzling_functor=SwizzlingFunctor.Identity1): + if type(data_types) is dict: + data_types = [data_types] + + for s in schedules: + assert(len(s) == 2) + if complex_transforms is None: complex_transforms = [(ComplexTransform.none, ComplexTransform.none), ] - element_a = data_type["a_type"] - element_b = data_type["b_type"] - element_c = data_type["c_type"] - element_d = data_type["d_type"] - element_acc = data_type["acc_type"] - element_epilogue = data_type.get("epi_type", element_acc) - operations = [] # by default, only generate the largest tile and largest alignment @@ -115,23 +114,25 @@ def CreateGemmUniversal3xOperator( for layout in layouts: for tile_description in tile_descriptions: - for complex_transform in complex_transforms: - for kernel_schedule, epilogue_schedule in schedules: - A = TensorDescription( - element_a, layout[0][0], layout[0][1], complex_transform[0]) - B = TensorDescription( - element_b, layout[1][0], layout[1][1], complex_transform[1]) + for data_type in data_types: + for complex_transform in complex_transforms: + for kernel_schedule, epilogue_schedule in schedules: + A = TensorDescription( + data_type["a_type"], layout[0][0], layout[0][1], complex_transform[0]) + B = TensorDescription( + data_type["b_type"], layout[1][0], layout[1][1], complex_transform[1]) - C = TensorDescription(element_c, layout[2][0], layout[2][1]) - D = TensorDescription(element_d, layout[2][0], layout[2][1]) + C = TensorDescription(data_type["c_type"], layout[2][0], layout[2][1]) + D = TensorDescription(data_type["d_type"], layout[2][0], layout[2][1]) - operation = GemmOperation( - GemmKind.Universal3x, tile_description.minimum_compute_capability, - tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor, D, - kernel_schedule, epilogue_schedule) + element_compute = data_type.get("epi_type", data_type["acc_type"]) + operation = GemmOperation( + GemmKind.Universal3x, tile_description.minimum_compute_capability, + tile_description, A, B, C, element_compute, epilogue_functor, swizzling_functor, D, + kernel_schedule, epilogue_schedule) - manifest.append(operation) - operations.append(operation) + manifest.append(operation) + operations.append(operation) return operations @@ -4118,21 +4119,19 @@ def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version): layout[2][1] = 8 if CudaToolkitVersionSatisfies(cuda_version, 12, 1): - kernel_schedules = [ - KernelScheduleType.ScheduleAuto, - KernelScheduleType.TmaWarpSpecializedCooperative, - KernelScheduleType.TmaWarpSpecializedPingpong, - KernelScheduleType.TmaWarpSpecialized + schedules = [ + [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto], + [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized], + [KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized], + [KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized] ] else: - kernel_schedules = [ - KernelScheduleType.ScheduleAuto, - KernelScheduleType.TmaWarpSpecialized + schedules = [ + [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto], + [KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized] # TmaWarpSpecializedCooperative and TmaWarpSpecializedPingpong require CUDA version >= 12.1 for optimal performance. ] - schedules = [[s, EpilogueScheduleType.ScheduleAuto] for s in kernel_schedules] - CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules) # persistent kernels with TMA epilogues @@ -4140,6 +4139,11 @@ def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version): CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, [[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized], [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]]) + # Emit instance without C allocation+load + data_type["c_type"] = DataType.void + CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, + [[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized], + [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]]) # for mixed precision kernels, also generate kernels that write output matrix in the A/B format # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. F16 accumulation) @@ -4166,6 +4170,11 @@ def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version): CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed, [[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized], [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]]) + # Emit instance without C allocation+load + data_type_mixed["c_type"] = DataType.void + CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed, + [[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized], + [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]]) # def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version): @@ -4212,19 +4221,32 @@ def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version): "acc_type" : math_inst.element_accumulator, "epi_type" : math_inst.element_accumulator } + + schedules = [ + [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto], + [KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized] + ] + + # TMA kernels with TT layout use EpilogueTransposed (NoSmemWarpSpecialized with swapped strides), + # because they use NN kernels underneath and transposing its epilogue will get the correct output + schedules_transposed_epilogue = [ + [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed], + [KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.EpilogueTransposed] + ] + # TMA kernels with TN or NN layout layouts_tf32_tn_nn = [layouts_tf32[0], layouts_tf32[2]] - CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_tf32) + CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_tf32, schedules) # TMA kernels with NT layout, only support 64x128x32 tile for now. layouts_tf32_nt = [layouts_tf32[3]] tile_64x128x32_descriptions = [tile_descriptions[0], tile_descriptions[1], tile_descriptions[2]] - CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_tf32) + tile_128x128x32_descriptions = [tile_descriptions[3], tile_descriptions[4], tile_descriptions[5]] + CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_tf32, schedules) + CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_128x128x32_descriptions, data_type_tf32, [schedules[1]]) - # TMA kernels with TT layout use EpilogueTransposed, because swapping NN kernel and transposed its epilogue will get the kernel layouts_tf32_tt = [layouts_tf32[1]] - CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_tf32, - [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed]]) + CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_tf32, schedules_transposed_epilogue) # F32 kernel share same settings with tf32 I/O kernels excluding data type data_type_f32 = { @@ -4236,10 +4258,10 @@ def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version): "epi_type" : DataType.f32 } - CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_f32) - CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_f32) - CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_f32, - [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed]]) + CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_f32, schedules) + CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_f32, schedules) + CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_128x128x32_descriptions, data_type_f32, [schedules[1]]) + CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_f32, schedules_transposed_epilogue) # def GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version): @@ -4910,8 +4932,8 @@ def GenerateSM90_TensorOp_1684_symm_complex_gaussian(manifest, cuda_version): # def GenerateSM90(manifest, cuda_version): GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version) - GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version) GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version) + GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version) GenerateSM90_TensorOp_1684(manifest, cuda_version) GenerateSM90_TensorOp_1684_complex(manifest, cuda_version) GenerateSM90_TensorOp_1684_complex_gaussian(manifest, cuda_version) diff --git a/tools/library/scripts/library.py b/tools/library/scripts/library.py index b12de786f..2f0fcfb4c 100644 --- a/tools/library/scripts/library.py +++ b/tools/library/scripts/library.py @@ -40,6 +40,7 @@ class GeneratorTarget(enum.Enum): # class DataType(enum.Enum): + void = enum_auto() # primarily used to disable C tensor for epilogues b1 = enum_auto() u4 = enum_auto() u8 = enum_auto() @@ -89,6 +90,7 @@ class DataType(enum.Enum): # DataTypeNames = { + DataType.void: "void", DataType.b1: "b1", DataType.u4: "u4", DataType.u8: "u8", @@ -121,10 +123,11 @@ class DataType(enum.Enum): DataType.cs8: "cs8", DataType.cs16: "cs16", DataType.cs32: "cs32", - DataType.cs64: "cs64", + DataType.cs64: "cs64", } DataTypeTag = { + DataType.void: "void", DataType.b1: "cutlass::uint1b_t", DataType.u4: "cutlass::uint4b_t", DataType.u8: "uint8_t", @@ -161,6 +164,7 @@ class DataType(enum.Enum): } DataTypeSize = { + DataType.void: 0, DataType.b1: 1, DataType.u4: 4, DataType.u8: 8, @@ -765,6 +769,7 @@ class TileDescription: def __init__(self, threadblock_shape, stages, warp_count, math_instruction, min_compute, max_compute, cluster_shape = [1,1,1]): self.threadblock_shape = threadblock_shape + self.tile_shape = threadblock_shape self.stages = stages self.warp_count = warp_count self.math_instruction = math_instruction diff --git a/tools/library/scripts/manifest.py b/tools/library/scripts/manifest.py index 8b53be413..48fdccb56 100644 --- a/tools/library/scripts/manifest.py +++ b/tools/library/scripts/manifest.py @@ -240,7 +240,9 @@ def __init__(self, args = None): self.kernel_filter_list = [] else: self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file) - + _LOGGER.info("Using {filter_count} kernel filters from {filter_file}".format( + filter_count = len(self.kernel_filter_list), + filter_file = args.kernel_filter_file)) self.operation_count = 0 self.operations_by_name = {} @@ -311,19 +313,29 @@ def filter(self, operation): # compare against the include list for name_substr in self.kernel_names: if self._filter_string_matches(name_substr, name): + _LOGGER.debug("Kernel {kernel} included due to filter string '{filt}'.".format( + kernel = operation.procedural_name(), + filt = name_substr)) enabled = True break # compare against the exclude list for name_substr in self.ignore_kernel_names: if self._filter_string_matches(name_substr, name): + _LOGGER.debug("Kernel {kernel} ignored due to filter string '{filt}'.".format( + kernel = operation.procedural_name(), + filt = name_substr)) enabled = False break if len(self.kernel_filter_list) > 0: - enabled = False if self.filter_out_kernels(operation.procedural_name(), self.kernel_filter_list): - enabled = True + _LOGGER.debug("Kernel {kernel} matched via kernel filter file.".format(kernel = operation.procedural_name())) + enabled = True + else: + _LOGGER.debug("Kernel {kernel} culled due to no match in kernel filter file.".format(kernel = operation.procedural_name())) + enabled = False + # todo: filter based on compute data type return enabled @@ -389,6 +401,8 @@ def emit(self, target = GeneratorTarget.Library): for operation_kind, configurations in self.operations.items(): with operation_emitters[target](generated_path, operation_kind, self.args) as operation_kind_emitter: for configuration_name, operations in configurations.items(): + _LOGGER.info("Emitting {config} with {num_ops} operations.".format( + config = configuration_name, num_ops = len(operations))) operation_kind_emitter.emit(configuration_name, operations) source_files += operation_kind_emitter.source_files diff --git a/tools/library/src/library_internal.h b/tools/library/src/library_internal.h index 5423edda7..4e4e09d2e 100644 --- a/tools/library/src/library_internal.h +++ b/tools/library/src/library_internal.h @@ -64,6 +64,10 @@ namespace library { template struct NumericTypeMap; +template <> struct NumericTypeMap { + static NumericTypeID const kId = NumericTypeID::kVoid; +}; + template <> struct NumericTypeMap { static NumericTypeID const kId = NumericTypeID::kB1; }; diff --git a/tools/profiler/CMakeLists.txt b/tools/profiler/CMakeLists.txt index 1675d42c4..368f8b9a6 100644 --- a/tools/profiler/CMakeLists.txt +++ b/tools/profiler/CMakeLists.txt @@ -107,15 +107,17 @@ set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM --operation=Symm --provid cutlass_add_executable_tests( test_profiler cutlass_profiler DEPENDEES test_all - TEST_COMMAND_OPTIONS - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SPGEMM - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_K - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_2K - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_TRMM - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM + TEST_COMMAND_OPTIONS + GEMM + CONV2D + CONV3D + SPGEMM + RANK_K + RANK_2K + TRMM + SYMM + TEST_COMMAND_OPTIONS_PREFIX + CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_ DISABLE_EXECUTABLE_INSTALL_RULE ) diff --git a/tools/profiler/src/cutlass_profiler.cu b/tools/profiler/src/cutlass_profiler.cu index 026ffdf0a..a4f377808 100644 --- a/tools/profiler/src/cutlass_profiler.cu +++ b/tools/profiler/src/cutlass_profiler.cu @@ -124,7 +124,7 @@ int CutlassProfiler::operator()() { options_.execution_mode == ExecutionMode::kTrace) { // Profiles all operations - profile_(); + return profile_(); } else if (options_.execution_mode == ExecutionMode::kEnumerate) { // Enumerates all operations @@ -157,7 +157,7 @@ int CutlassProfiler::profile_() { if (result) { return result; - } + } } } diff --git a/tools/profiler/src/device_allocation.cu b/tools/profiler/src/device_allocation.cu index f464ccb7a..600950e2d 100644 --- a/tools/profiler/src/device_allocation.cu +++ b/tools/profiler/src/device_allocation.cu @@ -462,6 +462,13 @@ size_t DeviceAllocation::bytes() const { /// Copies from an equivalent-sized tensor in device memory void DeviceAllocation::copy_from_device(void const *ptr) { + if (!bytes()) { +#ifndef NDEBUG + std::cout << "Skipping copy of size 0 allocation\n"; +#endif + return; + } + cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyDeviceToDevice); if (result != cudaSuccess) { throw std::runtime_error("Failed device-to-device copy"); @@ -470,22 +477,43 @@ void DeviceAllocation::copy_from_device(void const *ptr) { /// Copies from an equivalent-sized tensor in device memory void DeviceAllocation::copy_from_host(void const *ptr) { + if (!bytes()) { +#ifndef NDEBUG + std::cout << "Skipping copy of size 0 allocation\n"; +#endif + return; + } + cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyHostToDevice); if (result != cudaSuccess) { - throw std::runtime_error("Failed device-to-device copy"); + throw std::runtime_error("Failed host-to-device copy"); } } /// Copies from an equivalent-sized tensor in device memory void DeviceAllocation::copy_to_host(void *ptr) { + if (!bytes()) { +#ifndef NDEBUG + std::cout << "Skipping copy of size 0 allocation\n"; +#endif + return; + } + cudaError_t result = cudaMemcpy(ptr, data(), bytes(), cudaMemcpyDeviceToHost); if (result != cudaSuccess) { - throw std::runtime_error("Failed device-to-device copy"); + throw std::runtime_error("Failed device-to-host copy"); } } void DeviceAllocation::initialize_random_device(int seed, Distribution dist) { - if (!good()) { + if (!bytes()) { +#ifndef NDEBUG + std::cout << "Skipping initialization of size 0 allocation\n"; +#endif + return; + } + + if (!data()) { throw std::runtime_error("Attempting to initialize invalid allocation."); } @@ -690,7 +718,14 @@ void DeviceAllocation::initialize_random_device(int seed, Distribution dist) { } void DeviceAllocation::initialize_random_host(int seed, Distribution dist) { - if (!good()) { + if (!bytes()) { +#ifndef NDEBUG + std::cout << "Skipping initialization of size 0 allocation\n"; +#endif + return; + } + + if (!data()) { throw std::runtime_error("Attempting to initialize invalid allocation."); } @@ -699,7 +734,7 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) { switch (type_) { case library::NumericTypeID::kFE4M3: cutlass::reference::host::BlockFillRandom( - reinterpret_cast(pointer_), + reinterpret_cast(host_data.data()), capacity_, seed, dist @@ -707,7 +742,7 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) { break; case library::NumericTypeID::kFE5M2: cutlass::reference::host::BlockFillRandom( - reinterpret_cast(pointer_), + reinterpret_cast(host_data.data()), capacity_, seed, dist @@ -904,7 +939,14 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) { } void DeviceAllocation::initialize_random_sparsemeta_device(int seed, int MetaSizeInBits) { - if (!good()) { + if (!bytes()) { +#ifndef NDEBUG + std::cout << "Skipping initialization of size 0 allocation\n"; +#endif + return; + } + + if (!data()) { throw std::runtime_error("Attempting to initialize invalid allocation."); } @@ -934,7 +976,14 @@ void DeviceAllocation::initialize_random_sparsemeta_device(int seed, int MetaSiz } void DeviceAllocation::initialize_random_sparsemeta_host(int seed, int MetaSizeInBits) { - if (!good()) { + if (!bytes()) { +#ifndef NDEBUG + std::cout << "Skipping initialization of size 0 allocation\n"; +#endif + return; + } + + if (!data()) { throw std::runtime_error("Attempting to initialize invalid allocation."); } diff --git a/tools/profiler/src/gemm_operation_profiler.cu b/tools/profiler/src/gemm_operation_profiler.cu index 595c90847..a622e048a 100644 --- a/tools/profiler/src/gemm_operation_profiler.cu +++ b/tools/profiler/src/gemm_operation_profiler.cu @@ -68,6 +68,7 @@ GemmOperationProfiler::GemmOperationProfiler(Options const &options): {ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"}, {ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"}, {ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"}, + {ArgumentTypeID::kTensor, {"D"}, "Tensor storing the D output"}, {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"}, {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"}, {ArgumentTypeID::kEnumerated, {"split_k_mode", "split-k-mode"}, "Variant of split K mode(serial, parallel)"}, @@ -206,6 +207,10 @@ Status GemmOperationProfiler::GemmProblem::parse( return Status::kErrorInvalidProblem; } + if (!tensor_description_satisfies(operation_desc.D, "D", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + if (!arg_as_scalar( this->alpha, operation_desc.element_epilogue, @@ -307,6 +312,9 @@ void GemmOperationProfiler::GemmProblem::initialize_result( set_argument(result, "C", problem_space, std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout)); + set_argument(result, "D", problem_space, + std::string(library::to_string(operation_desc.D.element)) + ":" + library::to_string(operation_desc.D.layout)); + set_argument(result, "m", problem_space, m); set_argument(result, "n", problem_space, n); set_argument(result, "k", problem_space, k); @@ -537,8 +545,6 @@ Status GemmOperationProfiler::initialize_workspace( problem_.batch_count * gemm_workspace_.problem_count ); - gemm_workspace_.Reference->copy_from_device(gemm_workspace_.C->data()); - // NOTE: the leading non-batch strides are duplicated here for 3.0 API kernels gemm_workspace_.arguments.problem_size = {int(problem_.m), int(problem_.n), int(problem_.k)}; gemm_workspace_.arguments.batch_count = problem_.batch_count; diff --git a/tools/profiler/src/operation_profiler.cu b/tools/profiler/src/operation_profiler.cu index b2e8f9b74..737821c19 100644 --- a/tools/profiler/src/operation_profiler.cu +++ b/tools/profiler/src/operation_profiler.cu @@ -270,17 +270,17 @@ int OperationProfiler::profile_all( ProblemSpace::Iterator problem_it = problem_space.begin(); ProblemSpace::Iterator problem_end = problem_space.end(); - bool continue_profiling = true, internal_error = false; + bool continue_profiling = true; + int retval = 0; // For each problem in problem space for (; continue_profiling && problem_it != problem_end; ++problem_it) { - ProblemSpace::Problem problem = problem_it.at(); - report.next_problem(); // For each operation in manifest - for (auto const & operation_ptr : manifest) { + int matched_operation_count = 0; + for (auto const& operation_ptr : manifest) { library::Operation const *operation = operation_ptr.get(); @@ -292,8 +292,8 @@ int OperationProfiler::profile_all( // Execute compatible cutlass operations if they satisfy the current device's compute capability if (operation->description().kind == kind_ && - operation->description().provider == library::Provider::kCUTLASS && - options.device.compute_capability() >= min_cc && + operation->description().provider == library::Provider::kCUTLASS && + options.device.compute_capability() >= min_cc && options.device.compute_capability() <= max_cc) { std::string operation_name(operation->description().name); @@ -320,7 +320,10 @@ int OperationProfiler::profile_all( if (!filtered_by_name || !satisfies(operation->description(), problem_space, problem)) { continue; } - + + // we have found a kernel match, so increment the counter for match kernels + ++matched_operation_count; + // A. Initialize configuration Status status = this->initialize_configuration( options, @@ -374,7 +377,6 @@ int OperationProfiler::profile_all( // // B. Verify CUTLASS - if (continue_profiling && options.profiling.provider_enabled(library::Provider::kCUTLASS)) { continue_profiling = this->verify_cutlass( @@ -426,10 +428,18 @@ int OperationProfiler::profile_all( if (!continue_profiling) { break; } - } + } + + // If we did not find any kernels that match our filters and error_on_no_match was set, report an error + if (options.profiling.error_on_no_match && matched_operation_count <= 0) { + #if !NDEBUG + std::cout << "Error: No matching kernels found with kernel selection filters [--error_on_no_match]" << std::endl; + #endif + retval = 1; + } } - return internal_error ? 1 : 0; + return retval; } /////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/src/options.cu b/tools/profiler/src/options.cu index ab2b4ed0a..05f653024 100644 --- a/tools/profiler/src/options.cu +++ b/tools/profiler/src/options.cu @@ -706,10 +706,12 @@ Options::Options(cutlass::CommandLine const &cmdline): } else if (cmdline.check_cmd_line_flag("kernels")) { cmdline.get_cmd_line_arguments("kernels", operation_names); + profiling.error_on_no_match = cmdline.check_cmd_line_flag("error-on-no-match"); } if (cmdline.check_cmd_line_flag("ignore-kernels")) { cmdline.get_cmd_line_arguments("ignore-kernels", excluded_operation_names); + profiling.error_on_no_match = cmdline.check_cmd_line_flag("error-on-no-match"); } // Prevent launches on the device for anything other than CUTLASS operation diff --git a/tools/profiler/src/options.h b/tools/profiler/src/options.h index eba0172f6..d679c70ee 100644 --- a/tools/profiler/src/options.h +++ b/tools/profiler/src/options.h @@ -196,6 +196,9 @@ class Options { /// If true, profiling is actually conducted. bool enabled; + /// If true, profiling returns an error code if no kernels are found to match the filters. + bool error_on_no_match = false; + /// List of providers of each functionality to be profiled ProviderVector providers;