diff --git a/compiler/plugins/target/ROCM/builtins/tuning/test/spec_gfx942.mlir b/compiler/plugins/target/ROCM/builtins/tuning/test/spec_gfx942.mlir index b63ddc9f49d6..908f78192b84 100644 --- a/compiler/plugins/target/ROCM/builtins/tuning/test/spec_gfx942.mlir +++ b/compiler/plugins/target/ROCM/builtins/tuning/test/spec_gfx942.mlir @@ -4,6 +4,12 @@ // RUN: --iree-codegen-notify-transform-strategy-application \ // RUN: --verify-diagnostics %s | FileCheck %s +// RUN: iree-opt --split-input-file --iree-gpu-test-target=mi300x@hip \ +// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-configure-target-executable-variants{target=rocm})))" \ +// RUN: --iree-codegen-enable-default-tuning-specs \ +// RUN: --iree-codegen-notify-transform-strategy-application \ +// RUN: --verify-diagnostics %s | FileCheck %s --check-prefix=MI300X + // Check that the default configuration for mmt_2048x1280x5120_f16_f16_f32 // applies to the `linalg.matmul_transpose_b` below. @@ -11,6 +17,10 @@ // CHECK: linalg.generic // CHECK-SAME: __tuning_spec_applied__ +// MI300X-LABEL: func.func @mmt_2048x1280x5120_f16_f16_f32 +// MI300X: linalg.generic +// MI300X-SAME: __tuning_spec_applied__ + #pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, diff --git a/compiler/plugins/target/ROCM/test/target_device_features.mlir b/compiler/plugins/target/ROCM/test/target_device_features.mlir index 741235767885..a4c8679ada34 100644 --- a/compiler/plugins/target/ROCM/test/target_device_features.mlir +++ b/compiler/plugins/target/ROCM/test/target_device_features.mlir @@ -21,9 +21,9 @@ // GFX942-SAME: subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], // GFX942-SAME: max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, // GFX942-SAME: max_workgroup_counts = [2147483647, 2147483647, 2147483647], -// MI300X: chip = > -// MI300A: chip = > -// MI308X: chip = > +// MI300X: chip = > +// MI300A: chip = > +// MI308X: chip = > // GFX941: target = #iree_gpu.target fetchDefaultTuningSpec(StringRef identifier) { + std::string tuningSpecName = + llvm::formatv("iree_default_tuning_spec_{}.mlir", identifier); + std::optional tuningSpecSource; + + EmbeddedDataDirectory::withGlobal([&](EmbeddedDataDirectory &dir) { + tuningSpecSource = dir.getFile(tuningSpecName); + }); + + return tuningSpecSource; +} + static FailureOr getDefaultTuningSpec(ModuleOp module, IREE::Codegen::IREECodegenDialect &dialect) { @@ -123,14 +135,29 @@ getDefaultTuningSpec(ModuleOp module, return failure(); } - // Try to look up the default tuning spec for this architecture, if any. - StringRef arch = gpuTarget.getArch(); - std::string defaultTuningSpecName = - llvm::formatv("iree_default_tuning_spec_{}.mlir", arch); + std::optional sku; + if (IREE::GPU::TargetChipAttr chip = gpuTarget.getChip()) { + if (StringAttr chipSku = chip.getSku()) { + sku = chipSku.getValue(); + } + } + + std::string defaultTuningSpecName; std::optional defaultTuningSpecSource; - EmbeddedDataDirectory::withGlobal([&](EmbeddedDataDirectory &dir) { - defaultTuningSpecSource = dir.getFile(defaultTuningSpecName); - }); + if (sku) { + // GPUs with the same ISA may have different hardware characteristics such + // as the number of workgroup processors and power limits, Look up + // SKU-specific tuning spec for optimal performance. + defaultTuningSpecSource = fetchDefaultTuningSpec(*sku); + } + + if (!defaultTuningSpecSource) { + // If SKU-specific spec is not found, fall back to the default + // architecture-based tuning spec to ensure broader compatibility. + StringRef arch = gpuTarget.getArch(); + defaultTuningSpecSource = fetchDefaultTuningSpec(arch); + } + if (!defaultTuningSpecSource) { // Not all architectures are expected to provide default tuning specs, so // this shouldn't be considered a hard error (but that's up to the caller). diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td index c4dbc008ac04..c897ce19bdd9 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td @@ -416,6 +416,8 @@ def IREEGPU_TargetChipAttr : AttrDef { let parameters = (ins "uint32_t":$wgp_count, + // An optional SKU identifier to distinguish different models. + OptionalParameter<"StringAttr">:$sku, // An optional extra dict // This field allows to inject more features/limits not supported in the // above list for better flexibility. diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp index c347e28f2969..0d19c51e33ee 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp @@ -54,6 +54,7 @@ struct WgpDetails { // Chip level feature/limit details struct ChipDetails { uint32_t wgpCount; + std::optional sku; }; // Full target details @@ -116,9 +117,13 @@ TargetAttr createTargetAttr(const TargetDetails &details, StringRef arch, DictionaryAttr{}); TargetChipAttr targetChip; - if (details.chip) - targetChip = - TargetChipAttr::get(context, details.chip->wgpCount, DictionaryAttr{}); + if (details.chip) { + auto skuAttr = details.chip->sku + ? StringAttr::get(context, *details.chip->sku) + : StringAttr{}; + targetChip = TargetChipAttr::get(context, details.chip->wgpCount, skuAttr, + DictionaryAttr{}); + } return TargetAttr::get(context, arch, features, targetWgp, targetChip); } @@ -279,28 +284,27 @@ std::optional getAMDGPUTargetDetails(StringRef target) { // "AMD Instinct MI300 Series Product Offerings" in Page 23 of // https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/white-papers/amd-cdna-3-white-paper.pdf - static const ChipDetails mi300xChip = {304}; - static const ChipDetails mi300aChip = {228}; - static const ChipDetails mi308xChip = {80}; + static const ChipDetails mi300xChip = {304, "mi300x"}; + static const ChipDetails mi300aChip = {228, "mi300a"}; + static const ChipDetails mi308xChip = {80, "mi308x"}; // "AMD Instinct MI200 Series Accelerator Product Offerings" in Page 14 of // https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf - static const ChipDetails mi250xChip = {220}; - static const ChipDetails mi250Chip = {208}; - static const ChipDetails mi210Chip = {104}; + static const ChipDetails mi250xChip = {220, "mi250x"}; + static const ChipDetails mi250Chip = {208, "mi250"}; + static const ChipDetails mi210Chip = {104, "mi210"}; // "AMD CDNA Architecture Compute Units" in Page 5 of // https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna-white-paper.pdf - static const ChipDetails mi100Chip = {120}; + static const ChipDetails mi100Chip = {120, "mi100"}; - static const ChipDetails rx7900xtxChip = {96}; - static const ChipDetails rx7900xtChip = {84}; - static const ChipDetails rx7800xtChip = {60}; - static const ChipDetails rx7700xtChip = {54}; + static const ChipDetails rx7900xtxChip = {96, "rx7900xtx"}; + static const ChipDetails rx7900xtChip = {84, "rx7900xt"}; + static const ChipDetails rx7800xtChip = {60, "rx7800xt"}; + static const ChipDetails rx7700xtChip = {54, "rx7700xt"}; // See https://llvm.org/docs/AMDGPUUsage.html#processors for gfxN to // cdnaN/rdnaN mapping. - return llvm::StringSwitch>(target.lower()) .Case("mi300x", TargetDetails{cdna3Wgp, &mi300xChip}) .Case("mi300a", TargetDetails{cdna3Wgp, &mi300aChip})