From 4250be80f52531fda3c72542c3e351c5ad67b4a5 Mon Sep 17 00:00:00 2001 From: Simeon David Schaub Date: Thu, 9 Jan 2025 11:37:29 +0100 Subject: [PATCH] add detection for zen 5 (#56967) ref https://github.com/llvm/llvm-project/commit/149a150b50c112e26fc5acbdd58250c44ccd777f --------- Co-authored-by: gbaraldi --- src/features_x86.h | 28 +++++++++++++++++++++++++++- src/processor_x86.cpp | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/src/features_x86.h b/src/features_x86.h index 2ecc8fee32a38..b6e2b23985b4f 100644 --- a/src/features_x86.h +++ b/src/features_x86.h @@ -5,6 +5,13 @@ #else #define JL_X86_64ONLY_VER(x) x #endif +// The code is similar to what is here so the bits can be used as reference +// https://github.com/llvm/llvm-project/blob/3f7905733820851bc4f65cb4af693c3101cbf20d/llvm/lib/TargetParser/Host.cpp#L1257 + +// The way the bits here work is an index into the features array. This is a bit array +// The index works as follows: +// 32*i + j where i is the index into the array and j is the bit in the array. 
+// There is a reference to what each index corresponds to in _get_host_cpu // X86 features definition // EAX=1: ECX @@ -79,6 +86,7 @@ JL_FEATURE_DEF(avx512vp2intersect, 32 * 4 + 8, 0) JL_FEATURE_DEF(serialize, 32 * 4 + 14, 110000) JL_FEATURE_DEF(tsxldtrk, 32 * 4 + 16, 110000) JL_FEATURE_DEF(pconfig, 32 * 4 + 18, 0) +// JL_FEATURE_DEF(ibt, 32 * 4 + 20, 0) JL_FEATURE_DEF_NAME(amx_bf16, 32 * 4 + 22, 110000, "amx-bf16") JL_FEATURE_DEF(avx512fp16, 32 * 4 + 23, 140000) JL_FEATURE_DEF_NAME(amx_tile, 32 * 4 + 24, 110000, "amx-tile") @@ -110,10 +118,28 @@ JL_FEATURE_DEF(clzero, 32 * 8 + 0, 0) JL_FEATURE_DEF(wbnoinvd, 32 * 8 + 9, 0) // EAX=7,ECX=1: EAX +JL_FEATURE_DEF(sha512, 32 * 9 + 0, 170000) +JL_FEATURE_DEF(sm3, 32 * 9 + 1, 170000) +JL_FEATURE_DEF(sm4, 32 * 9 + 2, 170000) +JL_FEATURE_DEF(raoint, 32 * 9 + 3, 170000) JL_FEATURE_DEF(avxvnni, 32 * 9 + 4, 120000) JL_FEATURE_DEF(avx512bf16, 32 * 9 + 5, 0) +JL_FEATURE_DEF(cmpccxadd, 32 * 9 + 7, 160000) +JL_FEATURE_DEF_NAME(amx_fp16, 32 * 9 + 21, 160000, "amx-fp16") +JL_FEATURE_DEF(hreset, 32 * 9 + 22, 160000) +JL_FEATURE_DEF(avxifma, 32 * 9 + 23, 160000) + +// EAX=7,ECX=1: EDX +JL_FEATURE_DEF(avxvnniint8, 32 * 10 + 4, 160000) +JL_FEATURE_DEF(avxneconvert, 32 * 10 + 5, 160000) +JL_FEATURE_DEF_NAME(amx_complex, 32 * 10 + 8, 170000, "amx-complex") +JL_FEATURE_DEF(avxvnniint16, 32 * 10 + 10, 170000) +JL_FEATURE_DEF(prefetchi, 32 * 10 + 14, 160000) +JL_FEATURE_DEF(usermsr, 32 * 10 + 15, 170000) +// JL_FEATURE_DEF(avx10, 32 * 10 + 19, 170000) // TODO: What to do about avx10 and its mess? 
+// JL_FEATURE_DEF(apxf, 32 * 10 + 21, 190000) // EAX=0x14,ECX=0: EBX -JL_FEATURE_DEF(ptwrite, 32 * 10 + 4, 0) +JL_FEATURE_DEF(ptwrite, 32 * 11 + 4, 0) #undef JL_X86_64ONLY_VER diff --git a/src/processor_x86.cpp b/src/processor_x86.cpp index f1dff063de1d9..bf765be160ed2 100644 --- a/src/processor_x86.cpp +++ b/src/processor_x86.cpp @@ -96,9 +96,10 @@ enum class CPU : uint32_t { amd_znver2, amd_znver3, amd_znver4, + amd_znver5, }; -static constexpr size_t feature_sz = 11; +static constexpr size_t feature_sz = 12; static constexpr FeatureName feature_names[] = { #define JL_FEATURE_DEF(name, bit, llvmver) {#name, bit, llvmver}, #define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) {str, bit, llvmver}, @@ -141,6 +142,10 @@ static constexpr FeatureDep deps[] = { {vpclmulqdq, avx}, {vpclmulqdq, pclmul}, {avxvnni, avx2}, + {avxvnniint8, avx2}, + {avxvnniint16, avx2}, + {avxifma, avx2}, + {avxneconvert, avx2}, {avx512f, avx2}, {avx512dq, avx512f}, {avx512ifma, avx512f}, @@ -159,6 +164,8 @@ static constexpr FeatureDep deps[] = { {avx512fp16, avx512vl}, {amx_int8, amx_tile}, {amx_bf16, amx_tile}, + {amx_fp16, amx_tile}, + {amx_complex, amx_tile}, {sse4a, sse3}, {xop, fma4}, {fma4, avx}, @@ -166,6 +173,9 @@ static constexpr FeatureDep deps[] = { {xsaveopt, xsave}, {xsavec, xsave}, {xsaves, xsave}, + {sha512, avx2}, + {sm3, avx}, + {sm4, avx2}, }; // We require cx16 on 64bit by default. 
This can be overwritten with `-cx16` @@ -236,6 +246,7 @@ constexpr auto znver2 = znver1 | get_feature_masks(clwb, rdpid, wbnoinvd); constexpr auto znver3 = znver2 | get_feature_masks(shstk, pku, vaes, vpclmulqdq); constexpr auto znver4 = znver3 | get_feature_masks(avx512f, avx512cd, avx512dq, avx512bw, avx512vl, avx512ifma, avx512vbmi, avx512vbmi2, avx512vnni, avx512bitalg, avx512vpopcntdq, avx512bf16, gfni, shstk, xsaves); +constexpr auto znver5 = znver4 | get_feature_masks(avxvnni, movdiri, movdir64b, avx512vp2intersect, prefetchi); } @@ -298,6 +309,7 @@ static constexpr CPUSpec cpus[] = { {"znver2", CPU::amd_znver2, CPU::generic, 0, Feature::znver2}, {"znver3", CPU::amd_znver3, CPU::amd_znver2, 120000, Feature::znver3}, {"znver4", CPU::amd_znver4, CPU::amd_znver3, 160000, Feature::znver4}, + {"znver5", CPU::amd_znver5, CPU::amd_znver4, 190000, Feature::znver5}, }; static constexpr size_t ncpu_names = sizeof(cpus) / sizeof(cpus[0]); @@ -575,6 +587,9 @@ static CPU get_amd_processor_name(uint32_t family, uint32_t model, const uint32_ return CPU::amd_znver4; } return CPU::amd_znver3; // fallback + case 26: + // if (model <= 0x77) + return CPU::amd_znver5; } } @@ -660,11 +675,12 @@ static NOINLINE std::pair> _get_host_cpu(void) int32_t info7[4]; jl_cpuidex(info7, 7, 1); features[9] = info7[0]; + features[10] = info7[3]; } if (maxleaf >= 0x14) { int32_t info14[4]; jl_cpuidex(info14, 0x14, 0); - features[10] = info14[1]; + features[11] = info14[1]; } // Fix up AVX bits to account for OS support and match LLVM model @@ -705,7 +721,20 @@ static NOINLINE std::pair> _get_host_cpu(void) else { cpu = uint32_t(CPU::generic); } - + /* Feature bits to register map + feature[0] = ecx + feature[1] = edx + feature[2] = leaf 7 ebx + feature[3] = leaf 7 ecx + feature[4] = leaf 7 edx + feature[5] = leaf 0x80000001 ecx + feature[6] = leaf 0x80000001 edx + feature[7] = leaf 0xd subleaf 1 eax + feature[8] = leaf 0x80000008 ebx + feature[9] = leaf 7 subleaf 1 eax + 
feature[10] = leaf 7 subleaf 1 edx + feature[11] = leaf 0x14 ebx + */ return std::make_pair(cpu, features); }