```cuda
__global__ void matmul(half *A, half *B, half *C, int M, int N, int K,
                       float alpha, float beta) {
  // A is row-major, B is col-major
  // 128 threads [x, y, z] = [32, 2, 2]
  // threadblock mma: 128x128x32
  // warp mma: 64x64x16
  extern __shared__ uint8_t shared_storage[];
  half *SA = reinterpret_cast<half *>(shared_storage);
  half *SB =
      reinterpret_cast<half *>(shared_storage + MI * KI * sizeof(half));
  float *SC = reinterpret_cast<float *>(shared_storage);

  // FragA is split into MII / wmmaM fragments
  nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, wmmaM, wmmaN, wmmaK, half,
                         nvcuda::wmma::row_major>
      FragA[MII / wmmaM];
  // FragB is split into NII / wmmaN fragments
  nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, wmmaM, wmmaN, wmmaK, half,
                         nvcuda::wmma::col_major>
      FragB[NII / wmmaN];
  // The accumulator is split into (MII / wmmaM) * (NII / wmmaN) fragments
  nvcuda::wmma::fragment<nvcuda::wmma::accumulator, wmmaM, wmmaN, wmmaK,
                         float>
      Accum[MII / wmmaM * NII / wmmaN];

  // Initialize the accumulators
  for (int mii = 0; mii < MII / wmmaM; mii += 1) {
    for (int nii = 0; nii < NII / wmmaN; nii += 1) {
      nvcuda::wmma::fill_fragment(Accum[mii * (NII / wmmaN) + nii], 0.0);
    }
  }

  // Tile along the K dimension
  for (int ko = 0; ko < K / KI; ko += 1) {
    // Load the A and B tiles into shared memory
    loadSmemA(SA, A, M, K, ko);
    loadSmemB(SB, B, N, K, ko);
    __syncthreads();
    // Iterate over the K tile
    for (int ki = 0; ki < KI / KII; ki += 1) {
      // 64x64x16 mma for each warp: load the fragments
      loadFragA(FragA, SA, ki);
      loadFragB(FragB, SB, ki);
      for (int mii = 0; mii < MII / wmmaM; mii += 1) {
        for (int nii = 0; nii < NII / wmmaN; nii += 1) {
          // 16x16x16 for each wmma
          nvcuda::wmma::mma_sync(Accum[mii * (NII / wmmaN) + nii],
                                 FragA[mii], FragB[nii],
                                 Accum[mii * (NII / wmmaN) + nii]);
        }
      }
    }
  }
  storeAccum(SC, Accum);
  __syncthreads();
  storeSmemC(C, SC, M, N);
}
```
```cpp
// Data types for the input and output matrices and for the computation between them.
using ElementAccumulator = float;                  // <- data type of accumulator
using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations
using ElementInputA = cutlass::half_t;             // <- data type of elements in input matrix A
using ElementInputB = cutlass::half_t;             // <- data type of elements in input matrix B
using ElementOutput = float;                       // <- data type of elements in output matrix D
// Matrix layouts of the input and output matrices: column major for A, B and C.
using LayoutInputA = cutlass::layout::ColumnMajor;
using LayoutInputB = cutlass::layout::ColumnMajor;
using LayoutOutput = cutlass::layout::ColumnMajor;
// Whether to use tensor cores or regular SIMT cores on the GPU SM.
using MMAOp = cutlass::arch::OpClassTensorOp;
// CUDA SM architecture number.
using SmArch = cutlass::arch::Sm80;
// Tile size a thread block will compute.
using ShapeMMAThreadBlock =
    cutlass::gemm::GemmShape<128, 256, 64>; // <- threadblock tile M = 128, N = 256, K = 64
// Tile size a warp will compute.
using ShapeMMAWarp =
    cutlass::gemm::GemmShape<64, 64, 64>; // <- warp tile M = 64, N = 64, K = 64
// Size of the MMA op.
using ShapeMMAOp =
    cutlass::gemm::GemmShape<16, 8, 16>; // <- MMA op tile M = 16, N = 8, K = 16
// How threadblocks are scheduled on the GPU.
using SwizzleThreadBlock =
    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ??
// The epilogue: a linear combination of the accumulator with alpha/beta.
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
    ElementOutput, // <- data type of output matrix
    128 / cutlass::sizeof_bits<ElementOutput>::value, // <- the number of elements per vectorized
                                                      //    memory access. For half precision it is
                                                      //    8 elements; this also becomes the vector
                                                      //    width of math instructions in the epilogue.
    ElementAccumulator,      // <- data type of accumulator
    ElementComputeEpilogue>; // <- data type for alpha/beta in the linear combination function
// Number of pipeline stages to use.
constexpr int NumStages = 2;

using Gemm = cutlass::gemm::device::Gemm<
    ElementInputA, LayoutInputA, ElementInputB, LayoutInputB, ElementOutput,
    LayoutOutput, ElementAccumulator, MMAOp, SmArch, ShapeMMAThreadBlock,
    ShapeMMAWarp, ShapeMMAOp, EpilogueOp, SwizzleThreadBlock, NumStages>;
```
```python
import multiprocessing
import os
import re
import subprocess
import tempfile


class ProfilerEngine(object):
    """Compile and run a given profiler executable."""

    def __init__(self, cuda_arch, cutlass_path, binary_prefix):
        self.cuda_arch = cuda_arch
        self.binary_prefix = binary_prefix
        self.cutlass = cutlass_path
        self.cflags = "-I{cutlass}/include -I{cutlass}/tools/util/include -O3 -std=c++11".format(
            cutlass=cutlass_path
        )
        self.cflags += " -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1"
        self.cflags += " -gencode=arch=compute_{arch},code=[sm_{arch},compute_{arch}]".format(
            arch=cuda_arch
        )
        self.cflags += " -Xcompiler=-Wconversion -Xcompiler=-fno-strict-aliasing"
        self.cmd = "nvcc {cflags} {src} -o {output}"

    def _compile(self, op):
        os.makedirs(self.binary_prefix, exist_ok=True)
        opath = os.path.join(self.binary_prefix, op["name"])
        if os.path.exists(opath):
            return
        fi = tempfile.NamedTemporaryFile("w", delete=False, suffix=".cu")
        fi.write(op["src"])
        fi.close()
        cmd = self.cmd.format(cflags=self.cflags, src=fi.name, output=opath)
        os.system(cmd)
        os.unlink(fi.name)

    def compile_all(self, ops, use_multiprocessing=False):
        """Compile all profiler executables."""
        if use_multiprocessing:
            pool = multiprocessing.Pool(multiprocessing.cpu_count())
            pool.map(self._compile, ops)
        else:
            for op in ops:
                self._compile(op)

    def evaluate(self, op_name, args):
        """Run the profiler executable corresponding to op_name with args."""
        opath = os.path.join(self.binary_prefix, op_name)
        cmd = [opath]
        if args is not None:
            cmd.append(str(args[0]))
            cmd.append(str(args[1]))
            cmd.append(str(args[2]))
            if len(args) > 3:
                cmd.append(str(args[3]))
        try:
            sp = subprocess.run(cmd, capture_output=True, check=True)
            rt = float(sp.stdout)
            print(op_name, rt)
        except subprocess.CalledProcessError:
            rt = -1
        return rt


class CutlassGemmProfiler(object):
    """Profile all candidate kernels and select the best one."""

    def __init__(self, sm, cutlass_path, binary_path):
        assert sm in GENERATOR_FUNC_TABLE, "sm%d not supported yet." % sm
        self.engine = ProfilerEngine(sm, cutlass_path, binary_path)
        self.sm = sm

    def check_align(self, op_name, M):
        """Filter out kernels that cannot be supported."""
        aligns = re.findall(r"align[1|2|4|8]", op_name)
        assert len(aligns) == 1
        align = int(aligns[0][-1])
        if M % align != 0:
            return False
        return True

    def profile(self, M, N, K, out_dtype, profile_all=True, use_multiprocessing=False):
        """Profile and select the best kernel from candidate kernels.

        If profile_all is False, return immediately after the first applicable kernel is
        found. If use_multiprocessing is True, compile all profiler executables in parallel.
        """
        ops = GENERATOR_FUNC_TABLE[self.sm](out_dtype)
        ops = list(filter(lambda op: self.check_align(op["name"], M), ops))

        for op in ops:
            op["runtime"] = -1

        self.engine.compile_all(ops, use_multiprocessing)

        for op in ops:
            out = self.engine.evaluate(op["name"], [M, N, K])
            op["runtime"] = out
            if out > 0 and profile_all is False:
                break

        valid_ops = filter(lambda op: op["runtime"] > 0, ops)
        output = sorted(valid_ops, key=lambda i: i["runtime"])
        return output[0]
```
The profile function above is called from build.py, which produces the best-performing parameters:
```python
def tune_cutlass_kernels(mod, sm, profile_all=True, use_multiprocessing=False, tmp_dir="./tmp"):
    """Given a module partitioned for CUTLASS offloading, profile each workload to select which
    kernels to emit.

    Parameters
    ----------
    mod : IRModule
        The Relay module with cutlass partitions.
    sm : int
        An integer specifying the compute capability. For example, 75 for Turing and 80 or 86 for Ampere.
    profile_all : bool
        Whether or not profile all candidate kernels, or stop profiling after the first applicable kernel is found.
    use_multiprocessing : bool
        Whether or not compile profiler executables for different kernels in parallel.
    tmp_dir : string, optional
        A temporary directory where intermediate compiled artifacts will be stored.

    Returns
    -------
    mod : IRModule
        The updated module annotated with cutlass profiling information.
    num_cutlass_partition : int
        The number of partitioned functions created for CUTLASS.
    """
    cutlass_profiler = CutlassGemmProfiler(sm, "../../../3rdparty/cutlass", tmp_dir)
    num_cutlass_partition = 0
    for var in mod.get_global_vars():
        fun_name = var.name_hint
        func = mod[fun_name]
        annotator = GemmAnnotator()
        if "cutlass" in fun_name:
            num_cutlass_partition += 1
            annotator.visit(func)
            # call cutlass profiler to find best settings, update attr
            new_attrs = {}
            new_attrs.update(annotator.signature)
            for key in func.attrs.keys():
                new_attrs[key] = func.attrs[key]
            # call profiler
            arg0_shape = new_attrs["arg0_shape"]
            arg1_shape = new_attrs["arg1_shape"]
            MM = arg0_shape[0]
            KK = arg0_shape[1]
            NN = arg1_shape[0]
            out = cutlass_profiler.profile(
                MM, NN, KK, annotator.signature["ret_dtype"], profile_all, use_multiprocessing
            )
            if new_attrs["op_type"] == "cutlass.dense":
                new_attrs["cutlass_op_def"] = out["opdef"]
            elif new_attrs["op_type"] == "cutlass.dense_bias":
                new_attrs["cutlass_op_def"] = out["opdef_bias"]
            elif new_attrs["op_type"] == "cutlass.dense_bias_relu":
                new_attrs["cutlass_op_def"] = out["opdef_bias_relu"]
            elif "cutlass.dense_bias_gelu" in new_attrs["op_type"]:
                new_attrs["cutlass_op_def"] = out["opdef_bias_gelu"]
            else:
                raise ValueError("%s pattern is not implemented." % new_attrs["op_type"])
            new_attrs["cutlass_op_name"] = out["name"]
            print("The best kernel is " + new_attrs["cutlass_op_name"])
            if new_attrs["cutlass_op_name"].find("_tn_align") > 0:
                new_attrs["lda"] = "K"
                new_attrs["ldb"] = "K"
                new_attrs["ldc"] = "N"
            elif new_attrs["cutlass_op_name"].find("_nt_align") > 0:
                new_attrs["lda"] = "M"
                new_attrs["ldb"] = "N"
                new_attrs["ldc"] = "N"
            else:
                raise ValueError("%s unsupported operation" % new_attrs["cutlass_op_name"])
            new_attrs = tvm.ir.make_node("DictAttrs", **new_attrs)
            new_func = relay.Function(
                func.params, func.body, ret_type=func.ret_type,
                type_params=func.type_params, attrs=new_attrs,
            )
            mod.update_func(var, new_func)
    return mod, num_cutlass_partition


def build_cutlass_kernels(lib, sm, tmp_dir="./tmp", lib_path="compile.so"):
    """Compile CUTLASS kernels in lib and return the runtime module ready to run.

    Parameters
    ----------
    lib : GraphExecutorFactoryModule
        The output from relay.build containing compiled host code and non-cutlass kernels.
    sm : int
        An integer specifying the compute capability. For example, 75 for Turing and 80 or 86 for Ampere.
    tmp_dir : string, optional
        A temporary directory where intermediate compiled artifacts will be stored.
    lib_path : string, optional
        The path to a shared library which will be generated as the result of the build process.

    Returns
    -------
    updated_lib : runtime.Module
        The updated module with compiled cutlass kernels.
    """
    cutlass_path = "../../../3rdparty/cutlass/include"
    cutlass_util_path = "../../../3rdparty/cutlass/tools/util/include"
    kwargs = {}
    kwargs["cc"] = "nvcc"
    kwargs["options"] = [
        "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1",
        "-gencode=arch=compute_%d,code=[sm_%d,compute_%d]" % (sm, sm, sm),
        "-Xcompiler=-fPIC",
        "-Xcompiler=-Wconversion",
        "-Xcompiler=-fno-strict-aliasing",
        "-O3",
        "-std=c++14",
        "-I" + cutlass_path,
        "-I" + cutlass_util_path,
    ]
    lib.export_library(lib_path, workspace_dir=tmp_dir, **kwargs)
    return runtime.load_module(lib_path)
```
Code Generation Design Proposal
This section discusses the code generation proposal, using GEMM code generation on the Nvidia platform as an example.
Subgraph-based Generation
By default, operator-level code generation happens at the global-memory level: given the `input` and `output` tensors, a `__global__`-level kernel is generated for each operator and the kernels are stitched together across the whole compute graph. However, we sometimes need to fuse different operators to reduce synchronization and memory overhead. We therefore introduce a subgraph structure and specify the fusion level within each subgraph. For example, `subgraphs` is a recursive structure whose next level can be either another subgraph or a single operator, and lowering proceeds level by level at the fused memory level.
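The original example snippet is not reproduced here; as a purely illustrative sketch (the `OpNode`/`Subgraph` names and the `fuse_level` field are assumptions, not the proposal's actual schema), such a recursive description might look like this:

```python
from dataclasses import dataclass, field
from typing import List, Union


@dataclass
class OpNode:
    """A single operator, e.g. a GEMM or a GELU."""
    name: str
    inputs: List[str]
    outputs: List[str]


@dataclass
class Subgraph:
    """A recursive subgraph: children are either operators or nested subgraphs.

    fuse_level is the memory level at which the children are fused,
    e.g. "global", "shared", or "register".
    """
    fuse_level: str
    children: List[Union["Subgraph", OpNode]] = field(default_factory=list)


# Two GEMMs and a GELU fused at the shared-memory level; the whole pipeline lives in global memory.
graph = Subgraph(
    fuse_level="global",
    children=[
        Subgraph(
            fuse_level="shared",
            children=[
                OpNode("gemm0", inputs=["A", "B"], outputs=["C0"]),
                OpNode("gelu", inputs=["C0"], outputs=["C1"]),
                OpNode("gemm1", inputs=["C1", "D"], outputs=["E"]),
            ],
        ),
    ],
)
```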
Primitive Design
This code generation proposal aims to design a set of primitives that can be composed to generate code for different fused operators. The following uses Nvidia's Tensor Core, Cute, and Cutlass stacks as illustrations. Going from Tensor Core to Cute to Cutlass, ease of use increases step by step, while flexibility decreases.
Tensor Core
GEMM
Above is a simple GEMM kernel implemented with Tensor Core in the sliced-K style (the first code listing at the top of this note). This code involves a series of primitives; a sketch of how they might compose follows the list.

- `CudaVar`: a CUDA variable, covering declaration, initialization, filling, and so on; it has to be defined for each CUDA memory level.
- `iteration`: a loop object that takes the loop variable, start, end, and step as members, and can emit the loop as well as expose the loop variable.
- `sync`: synchronization primitives generated per memory level, e.g. `__syncthreads()` and `cudaDeviceSynchronize()`.
- `mma`: MMA is the built-in Tensor Core operation in CUDA. It comes in two flavors: the WMMA API provided by Nvidia, a plain library call that is easy to use but slower than raw MMA, and hand-written CUDA PTX MMA instructions, which are hard to implement but deliver high performance.
- `Load`/`Store`: memory load and store operations, the most complex primitives to design, since they involve the thread layout of the launched kernel as well as the memory layout. For example, to satisfy MMA's memory-layout requirements, a two-dimensional [MTILE, NTILE] shared-memory tile must be laid out as a four-dimensional [MTILE/WmmaM, NTILE/WmmaN, WmmaM, WmmaN] buffer.
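As a hedged sketch of how such primitives might compose into an emitter (the `CudaVar`, `Iteration`, and `sync` definitions below, and the string-emission approach, are illustrative assumptions rather than a fixed design):

```python
class CudaVar:
    """A variable bound to a CUDA memory level: global, shared, or fragment."""

    def __init__(self, name, dtype, scope, size=None):
        self.name, self.dtype, self.scope, self.size = name, dtype, scope, size

    def decl(self):
        if self.scope == "shared":
            return "__shared__ {} {}[{}];".format(self.dtype, self.name, self.size)
        return "{} {};".format(self.dtype, self.name)


class Iteration:
    """A loop object holding the loop variable, start, end, and step."""

    def __init__(self, var, start, end, step=1):
        self.var, self.start, self.end, self.step = var, start, end, step

    def begin(self):
        return "for (int {v} = {s}; {v} < {e}; {v} += {st}) {{".format(
            v=self.var, s=self.start, e=self.end, st=self.step
        )

    def close(self):
        return "}"


def sync(level):
    """Synchronization primitive for a given memory level."""
    return "__syncthreads();" if level == "shared" else "cudaDeviceSynchronize();"


# Compose the primitives to emit the skeleton of the ko loop from the sliced-K kernel above.
smem_a = CudaVar("SA", "half", "shared", size="MI * KI")
ko = Iteration("ko", 0, "K / KI")
print("\n".join([smem_a.decl(), ko.begin(),
                 "  loadSmemA(SA, A, M, K, ko);",
                 "  " + sync("shared"),
                 ko.close()]))
```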
GEMM fused
This is a hand-written Tensor Core implementation that fuses a GEMM, a GELU, and a second GEMM at the shared-memory level. To generate such code, we need to specify the fusion level and the memory level of each input and output variable.
For example, for the first-stage GEMM, both input variables live in global memory and the output variable lives in shared memory. This means that for the first stage we should not synchronize back to global memory; code generation should stop once the result has been written to shared memory.
For the second stage, input A is in shared memory, input B is in global memory, and output C is in shared memory, with the fusion level set to the shared-memory level. This means A does not need to be loaded from global memory; we only need to load B into shared memory and run the sliced-K GEMM one more time. Of course, this requires the first-stage and second-stage GEMMs to have the same thread-block organization, i.e. ThreadBlock_0 = ThreadBlock_1.
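To make this concrete, here is a small, purely illustrative description of the two stages' memory levels (the dictionary layout and field names are assumptions; only the memory assignments follow the text above):

```python
# Memory level of every input/output in the fused GEMM -> GELU -> GEMM kernel.
fused_plan = {
    "fuse_level": "shared",
    "stages": [
        # Stage 0: GEMM with a GELU epilogue. Inputs come from global memory and
        # the result stays in shared memory, so no global synchronization is emitted.
        {"op": "gemm0", "epilogue": "gelu", "A": "global", "B": "global", "C": "shared"},
        # Stage 1: reuses stage 0's output as A directly from shared memory;
        # only B is staged through shared memory before the sliced-K GEMM runs again.
        {"op": "gemm1", "epilogue": None, "A": "shared", "B": "global", "C": "shared"},
    ],
    # Both stages must use the same thread-block organization.
    "constraint": "ThreadBlock_0 == ThreadBlock_1",
}
```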
TVM support related to Tensor Core:

- PR 4136 mainly adds intrinsics such as `mma_sync` and `fill_fragment`.
- Issue 4105 initially proposed detecting Tensor Core opportunities from the shape of the AST, but that cannot identify Tensor Core usage precisely. Since 4136 had already introduced the new intrinsics, Tianqi Chen argued that this is a new class of hardware and that codegen should be driven by new primitives.
- 4234 reorganizes the AST around these primitives to do the generation.

Cute
TODO
Cutlass
Performance Evaluation
TVM support related to CUTLASS:
PR 9261 introduces CUTLASS code generation in TVM through BYOC; this was discussed earlier in 9147. The backend code added lives in `src/relay/backend/contrib/cutlass/codegen.cc`. The implementation is still fairly rudimentary: it mainly prints code based on the `attrs` passed down from the upper layers, and it handles simple fusions, such as gelu and biasadd, as epilogues. `GenerateCompositeFunctionCall` dispatches on the different patterns and calls `GenerateBody`, and `GenerateBody` does some further analysis and calls `DenseOp`.
`python/tvm/contrib/cutlass/gen_gemm.py` describes how the CUTLASS code generation is done on the Python side. `create_gemm_operator` takes `layouts`, `tile_descriptions`, and `alignment_constraints` and iterates over them in three nested loops, apparently to enumerate every CUTLASS layout variant to generate; it then creates a `GemmOperation` and generates the code:
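The actual body of `create_gemm_operator` is not reproduced in this note; as a rough, hedged sketch of the enumeration structure it describes (the function and helper names below are hypothetical, not TVM's API):

```python
def make_kernel_name(op):
    # Hypothetical helper: build a readable name such as "gemm_128x256x64_align8".
    return "gemm_{}x{}x{}_align{}".format(*op["tile"], op["alignment"])


def emit_kernel_source(op):
    # Hypothetical stand-in for the emitter that renders the CUTLASS C++ template.
    return "// CUTLASS GEMM kernel for " + make_kernel_name(op)


def generate_candidates(layouts, tile_descriptions, alignment_constraints, data_type):
    """Enumerate one candidate kernel per (layout, tile, alignment) combination.

    This mirrors the three nested loops described above: each combination yields a
    GemmOperation-like description whose source is emitted for later profiling.
    """
    candidates = []
    for layout in layouts:                       # e.g. (ColumnMajor, ColumnMajor, ColumnMajor)
        for tile in tile_descriptions:           # threadblock / warp / instruction tile shapes
            for align in alignment_constraints:  # vectorized-access alignment: 1, 2, 4, or 8
                op = {"layout": layout, "tile": tile, "alignment": align, "data_type": data_type}
                op["name"] = make_kernel_name(op)
                op["src"] = emit_kernel_source(op)
                candidates.append(op)
    return candidates


# Example: two tile shapes and two alignments give four candidates for one layout.
ops = generate_candidates(
    layouts=[("col", "col", "col")],
    tile_descriptions=[(128, 256, 64), (64, 64, 64)],
    alignment_constraints=[8, 4],
    data_type=("float16", "float16", "float32", "float32"),
)
print([op["name"] for op in ops])
```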
We can see that `create_gemm_operator` calls an `emit` function, mainly so that the candidates can be profiled and the best layout selected; the concrete code is specified in `gemm_profiler`: it first defines the performance-measurement template needed for profiling, then does the actual generation from the parameters and evaluates the performance. `gen_gemm.py` also specifies how to run the profiler and pick out the best-performing parameters (the `CutlassGemmProfiler` listing above), and that function is in turn called from `build.py`, which produces the best-performing parameters (the `tune_cutlass_kernels` listing above).
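Putting these pieces together, here is a hedged end-to-end sketch of the profile-and-build flow around this PR (module paths and defaults follow the TVM tests of that era and may differ in other versions; note that `tune_cutlass_kernels` above hard-codes a CUTLASS path relative to the TVM source tree):

```python
import tvm
from tvm import relay
from tvm.contrib import graph_executor
from tvm.contrib.cutlass import tune_cutlass_kernels, build_cutlass_kernels
from tvm.relay.op.contrib.cutlass import partition_for_cutlass

# A single fp16 dense workload with an fp32 accumulator/output.
data = relay.var("data", shape=(16, 64), dtype="float16")
weight = relay.var("weight", shape=(32, 64), dtype="float16")
mod = tvm.IRModule.from_expr(relay.nn.dense(data, weight, out_dtype="float32"))

# Offload matching patterns to CUTLASS, then profile each partition (sm=80 -> Ampere)
# so the winning kernel's definition and name are attached as function attributes.
mod = partition_for_cutlass(mod)
mod, num_cutlass_partition = tune_cutlass_kernels(mod, sm=80, tmp_dir="./tmp")

with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target="cuda")

# Compile the emitted CUTLASS sources with nvcc and load the final runtime module.
lib = build_cutlass_kernels(lib, sm=80, tmp_dir="./tmp", lib_path="compile.so")
rt_mod = graph_executor.GraphModule(lib["default"](tvm.cuda(0)))
```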
GEMM Optimization