From 216195fee94db325c527737d07ae994162dce14e Mon Sep 17 00:00:00 2001
From: Bangtian Liu
Date: Thu, 23 Jan 2025 18:20:15 -0600
Subject: [PATCH] [Codegen][Tuner] populate the default tuning specs for mi308x

Signed-off-by: Bangtian Liu
---
 .../target/ROCM/builtins/tuning/BUILD.bazel   |  1 +
 .../ROCM/builtins/tuning/CMakeLists.txt       |  2 +
 .../iree_default_tuning_spec_mi308x.mlir      | 72 +++++++++++++++++++
 .../ROCM/builtins/tuning/test/BUILD.bazel     |  1 +
 .../ROCM/builtins/tuning/test/CMakeLists.txt  |  1 +
 .../builtins/tuning/test/spec_mi308x.mlir     | 46 ++++++++++++
 6 files changed, 123 insertions(+)
 create mode 100644 compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_mi308x.mlir
 create mode 100644 compiler/plugins/target/ROCM/builtins/tuning/test/spec_mi308x.mlir

diff --git a/compiler/plugins/target/ROCM/builtins/tuning/BUILD.bazel b/compiler/plugins/target/ROCM/builtins/tuning/BUILD.bazel
index 00fc739578db..39b1ff683d4d 100644
--- a/compiler/plugins/target/ROCM/builtins/tuning/BUILD.bazel
+++ b/compiler/plugins/target/ROCM/builtins/tuning/BUILD.bazel
@@ -26,6 +26,7 @@ endif()
 # Target archs for tuning specs. https://llvm.org/docs/AMDGPUUsage.html#processors
 gpu_archs = [
     "gfx942",
+    "mi308x",
 ]
 
 tuning_spec_mlir_files = [
diff --git a/compiler/plugins/target/ROCM/builtins/tuning/CMakeLists.txt b/compiler/plugins/target/ROCM/builtins/tuning/CMakeLists.txt
index a04c528edc6c..50fa0ad4e4e1 100644
--- a/compiler/plugins/target/ROCM/builtins/tuning/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/builtins/tuning/CMakeLists.txt
@@ -19,6 +19,7 @@ iree_c_embed_data(
     iree_default_tuning_specs_amdgpu
   SRCS
     "iree_default_tuning_spec_gfx942.mlir"
+    "iree_default_tuning_spec_mi308x.mlir"
   C_FILE_OUTPUT
     "iree_default_tuning_specs_amdgpu.c"
   H_FILE_OUTPUT
@@ -32,6 +33,7 @@ iree_lit_test_suite(
     verify_default_tuning_specs_amdgpu
   SRCS
     "iree_default_tuning_spec_gfx942.mlir"
+    "iree_default_tuning_spec_mi308x.mlir"
   TOOLS
     iree-opt
 )
diff --git a/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_mi308x.mlir b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_mi308x.mlir
new file mode 100644
index 000000000000..cb668d59a6f9
--- /dev/null
+++ b/compiler/plugins/target/ROCM/builtins/tuning/iree_default_tuning_spec_mi308x.mlir
@@ -0,0 +1,72 @@
+// RUN: iree-opt %s
+
+// This is just an initial tuning spec for mi308x and is not intended for
+// production use.
+// TODO(https://github.com/iree-org/iree/issues/19214): Add missing
+// configurations to this spec.
+
+module @iree_default_tuning_spec_mi308x attributes { transform.with_named_sequence, iree_codegen.tuning_spec_with_default_entrypoint } {
+
+transform.named_sequence @apply_op_config(%op: !transform.any_op {transform.readonly},
+                                          %config: !transform.any_param {transform.readonly}) {
+  // transform.print %op {name="Apply on"} : !transform.any_op
+  transform.annotate %op "compilation_info" = %config : !transform.any_op, !transform.any_param
+  // Add a dummy unit attribute to be sure that the tuning spec applied.
+  // Otherwise it would be difficult to tell if the lowering config attribute
+  // comes from our tuning spec or if the compiler heuristic happened to produce
+  // the same config as this script.
+  transform.annotate %op "__tuning_spec_applied__" : !transform.any_op
+  transform.yield
+}
+
+transform.named_sequence @match_mmt_f16_f16_f32(%root: !transform.any_op {transform.readonly}) -> !transform.any_op {
+  transform.match.operation_name %root ["linalg.generic"] : !transform.any_op
+  // transform.print %root {name = "Generic"} : !transform.any_op
+  %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %root {
+  ^bb0(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>, %out: tensor<?x?xf32>):
+    %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
+                                          affine_map<(d0, d1, d2) -> (d1, d2)>,
+                                          affine_map<(d0, d1, d2) -> (d0, d1)>],
+                         iterator_types = ["parallel", "parallel", "reduction"]}
+      ins(%lhs, %rhs : tensor<?x?xf16>, tensor<?x?xf16>) outs(%out : tensor<?x?xf32>) {
+    ^bb0(%in: f16, %in_0: f16, %acc: f32):
+      %8 = arith.extf %in : f16 to f32
+      %9 = arith.extf %in_0 : f16 to f32
+      %10 = arith.mulf %8, %9 : f32
+      %11 = arith.addf %acc, %10 : f32
+      linalg.yield %11 : f32
+    } -> tensor<?x?xf32>
+  } : (!transform.any_op) -> (!transform.any_value, !transform.any_value)
+  transform.yield %root : !transform.any_op
+}
+
+transform.named_sequence @match_mmt_1920x1280x1280(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
+  %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
+  %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
+  %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
+  transform.iree.match.cast_compatible_type %lhs = tensor<1920x1280xf16> : !transform.any_value
+  transform.iree.match.cast_compatible_type %rhs = tensor<1280x1280xf16> : !transform.any_value
+  %config = transform.param.constant #iree_codegen.compilation_info<
+    lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
+                                                 mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
+                                                 subgroup_m_count = 4, subgroup_n_count = 2,
+                                                 reduction = [0, 0, 32],
+                                                 workgroup = [128, 128, 0]}>,
+    translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
+                                                      workgroup_size = [128, 4, 1] subgroup_size = 64,
+      {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
+       llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
+      }>> -> !transform.any_param
+  transform.yield %matmul, %config : !transform.any_op, !transform.any_param
+}
+
+transform.named_sequence
+@__kernel_config(%variant_op: !transform.any_op {transform.consumed}) -> !transform.any_op
+  attributes { iree_codegen.tuning_spec_entrypoint } {
+  %res = transform.foreach_match in %variant_op
+    @match_mmt_1920x1280x1280 -> @apply_op_config
+    : (!transform.any_op) -> !transform.any_op
+  transform.yield %res : !transform.any_op
+}
+
+}
diff --git a/compiler/plugins/target/ROCM/builtins/tuning/test/BUILD.bazel b/compiler/plugins/target/ROCM/builtins/tuning/test/BUILD.bazel
index ab6ada2de4b1..3e5c69bcfd2e 100644
--- a/compiler/plugins/target/ROCM/builtins/tuning/test/BUILD.bazel
+++ b/compiler/plugins/target/ROCM/builtins/tuning/test/BUILD.bazel
@@ -26,6 +26,7 @@ iree_lit_test_suite(
     name = "lit",
     srcs = [
         "spec_gfx942.mlir",
+        "spec_mi308x.mlir",
     ],
     cfg = "//compiler:lit.cfg.py",
     tools = [
diff --git a/compiler/plugins/target/ROCM/builtins/tuning/test/CMakeLists.txt b/compiler/plugins/target/ROCM/builtins/tuning/test/CMakeLists.txt
index 0ed361461c04..3cc8ea4e96b8 100644
--- a/compiler/plugins/target/ROCM/builtins/tuning/test/CMakeLists.txt
+++ b/compiler/plugins/target/ROCM/builtins/tuning/test/CMakeLists.txt
@@ -19,6 +19,7 @@ iree_lit_test_suite(
     lit
   SRCS
     "spec_gfx942.mlir"
+    "spec_mi308x.mlir"
   TOOLS
     FileCheck
     iree-opt
diff --git a/compiler/plugins/target/ROCM/builtins/tuning/test/spec_mi308x.mlir b/compiler/plugins/target/ROCM/builtins/tuning/test/spec_mi308x.mlir
new file mode 100644
index 000000000000..4c522d2ce52b
--- /dev/null
+++ b/compiler/plugins/target/ROCM/builtins/tuning/test/spec_mi308x.mlir
@@ -0,0 +1,46 @@
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=mi308x@hip \
+// RUN:   --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-configure-target-executable-variants{target=rocm})))" \
+// RUN:   --iree-codegen-enable-default-tuning-specs \
+// RUN:   --iree-codegen-notify-transform-strategy-application \
+// RUN:   --verify-diagnostics %s | FileCheck %s
+
+// Check that the default configuration for mmt_1920x1280x1280_f16_f16_f32
+// applies to the `linalg.matmul_transpose_b` below.
+
+// CHECK-LABEL: func.func @mmt_1920x1280x1280_f16_f16_f32
+// CHECK:         linalg.generic
+// CHECK-SAME:      __tuning_spec_applied__
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer, ReadOnly>,
+  #hal.pipeline.binding<storage_buffer, ReadOnly>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+hal.executable public @main {
+  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export public @matmul_transpose_b ordinal(0) layout(#pipeline_layout) {
+    ^bb0(%arg0: !hal.device):
+      %x, %y, %z = flow.dispatch.workgroup_count_from_slice
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      // expected-remark@+1 {{Applied transform configuration strategy @iree_default_tuning_spec_mi308x::@__kernel_config}}
+      func.func @mmt_1920x1280x1280_f16_f16_f32() {
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1920x1280xf16>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1280x1280xf16>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<1920x1280xf32>>
+        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1920, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1920x1280xf16>> -> tensor<1920x1280xf16>
+        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1280, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1280x1280xf16>> -> tensor<1280x1280xf16>
+        %5 = tensor.empty() : tensor<1920x1280xf32>
+        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1920x1280xf32>) -> tensor<1920x1280xf32>
+        %7 = linalg.matmul_transpose_b
+          ins(%3, %4 : tensor<1920x1280xf16>, tensor<1280x1280xf16>)
+          outs(%6 : tensor<1920x1280xf32>) -> tensor<1920x1280xf32>
+        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1920, 1280], strides = [1, 1] : tensor<1920x1280xf32> -> !flow.dispatch.tensor<writeonly:tensor<1920x1280xf32>>
+        return
+      }
+    }
+  }
+}
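
Reviewer sketch (illustration only, not part of the patch): when the default spec fires on the
dispatch above, @apply_op_config attaches the selected compilation_info and the
__tuning_spec_applied__ unit attribute to the matched op, which by the time the spec runs is the
generalized linalg.generic form of the linalg.matmul_transpose_b. Assuming that, the annotated op
should print roughly as follows (attribute contents abbreviated; the exact printed form depends on
the IREE build):

  %7 = linalg.generic {indexing_maps = [...], iterator_types = [...]}
         ins(%3, %4 : tensor<1920x1280xf16>, tensor<1280x1280xf16>)
         outs(%6 : tensor<1920x1280xf32>)
         attrs = {__tuning_spec_applied__,
                  compilation_info = #iree_codegen.compilation_info<...>} {
    ...
  } -> tensor<1920x1280xf32>

This is what the CHECK / CHECK-SAME lines in spec_mi308x.mlir assert on.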