diff --git a/python_bindings/src/halide/halide_/PyEnums.cpp b/python_bindings/src/halide/halide_/PyEnums.cpp
index 5a442bdca737..6aa83534b4b4 100644
--- a/python_bindings/src/halide/halide_/PyEnums.cpp
+++ b/python_bindings/src/halide/halide_/PyEnums.cpp
@@ -179,7 +179,16 @@ void define_enums(py::module &m) {
         .value("ARMFp16", Target::Feature::ARMFp16)
         .value("LLVMLargeCodeModel", Target::Feature::LLVMLargeCodeModel)
         .value("RVV", Target::Feature::RVV)
+        .value("ARMv8a", Target::Feature::ARMv8a)
         .value("ARMv81a", Target::Feature::ARMv81a)
+        .value("ARMv82a", Target::Feature::ARMv82a)
+        .value("ARMv83a", Target::Feature::ARMv83a)
+        .value("ARMv84a", Target::Feature::ARMv84a)
+        .value("ARMv85a", Target::Feature::ARMv85a)
+        .value("ARMv86a", Target::Feature::ARMv86a)
+        .value("ARMv87a", Target::Feature::ARMv87a)
+        .value("ARMv88a", Target::Feature::ARMv88a)
+        .value("ARMv89a", Target::Feature::ARMv89a)
         .value("SanitizerCoverage", Target::Feature::SanitizerCoverage)
         .value("ProfileByTimer", Target::Feature::ProfileByTimer)
         .value("SPIRV", Target::Feature::SPIRV)
diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
index 0e4de6baa050..24bd7d9e4e34 100644
--- a/src/CodeGen_ARM.cpp
+++ b/src/CodeGen_ARM.cpp
@@ -32,6 +32,42 @@ using namespace llvm;
 
 namespace {
 
+// Populate feature flags in a target according to those implied by
+// existing flags, so that instruction patterns can just check for the
+// oldest feature flag that supports an instruction.
+//
+// According to LLVM, ARM architectures have the following is-a-superset-of
+// relationships:
+//
+//   v9.5a > v9.4a > v9.3a > v9.2a > v9.1a > v9a;
+//             v       v       v       v       v
+//           v8.9a > v8.8a > v8.7a > v8.6a > v8.5a > v8.4a > ... > v8a;
+//
+// v8r has no relation to anything.
+Target complete_arm_target(Target t) {
+    constexpr int num_arm_v8_features = 10;
+    static const Target::Feature arm_v8_features[num_arm_v8_features] = {
+        Target::ARMv89a,
+        Target::ARMv88a,
+        Target::ARMv87a,
+        Target::ARMv86a,
+        Target::ARMv85a,
+        Target::ARMv84a,
+        Target::ARMv83a,
+        Target::ARMv82a,
+        Target::ARMv81a,
+        Target::ARMv8a,
+    };
+
+    for (int i = 0; i < num_arm_v8_features - 1; i++) {
+        if (t.has_feature(arm_v8_features[i])) {
+            t.set_feature(arm_v8_features[i + 1]);
+        }
+    }
+
+    return t;
+}
+
 // Substitute in loads that feed into slicing shuffles, to help with vld2/3/4
 // emission. These are commonly lifted as lets because they get used by multiple
 // interleaved slices of the same load.
@@ -201,7 +237,7 @@ class CodeGen_ARM : public CodeGen_Posix {
 };
 
 CodeGen_ARM::CodeGen_ARM(const Target &target)
-    : CodeGen_Posix(target) {
+    : CodeGen_Posix(complete_arm_target(target)) {
 
     // TODO(https://github.com/halide/Halide/issues/8088): See if
     // use_llvm_vp_intrinsics can replace architecture specific code in this
@@ -2445,9 +2481,9 @@ string CodeGen_ARM::mcpu_target() const {
         }
     } else {
         if (target.os == Target::IOS) {
-            return "cyclone";
+            return "apple-a7";
         } else if (target.os == Target::OSX) {
-            return "apple-a12";
+            return "apple-m1";
         } else if (target.has_feature(Target::SVE2)) {
             return "cortex-x1";
         } else {
@@ -2465,9 +2501,36 @@ string CodeGen_ARM::mattrs() const {
     if (target.has_feature(Target::ARMFp16)) {
         attrs.emplace_back("+fullfp16");
     }
+    if (target.has_feature(Target::ARMv8a)) {
+        attrs.emplace_back("+v8a");
+    }
     if (target.has_feature(Target::ARMv81a)) {
         attrs.emplace_back("+v8.1a");
     }
+    if (target.has_feature(Target::ARMv82a)) {
+        attrs.emplace_back("+v8.2a");
+    }
+    if (target.has_feature(Target::ARMv83a)) {
+        attrs.emplace_back("+v8.3a");
+    }
+    if (target.has_feature(Target::ARMv84a)) {
+        attrs.emplace_back("+v8.4a");
+    }
+    if (target.has_feature(Target::ARMv85a)) {
+        attrs.emplace_back("+v8.5a");
+    }
+    if (target.has_feature(Target::ARMv86a)) {
+        attrs.emplace_back("+v8.6a");
+    }
+    if (target.has_feature(Target::ARMv87a)) {
+        attrs.emplace_back("+v8.7a");
+    }
+    if (target.has_feature(Target::ARMv88a)) {
+        attrs.emplace_back("+v8.8a");
+    }
+    if (target.has_feature(Target::ARMv89a)) {
+        attrs.emplace_back("+v8.9a");
+    }
     if (target.has_feature(Target::ARMDotProd)) {
         attrs.emplace_back("+dotprod");
     }
@@ -2482,7 +2545,7 @@ string CodeGen_ARM::mattrs() const {
         }
     } else {
         // TODO: Should Halide's SVE flags be 64-bit only?
-        // TODO: Sound we ass "-neon" if NoNEON is set? Does this make any sense?
+        // TODO: Should we add "-neon" if NoNEON is set? Does this make any sense?
         if (target.has_feature(Target::SVE2)) {
             attrs.emplace_back("+sve2");
         } else if (target.has_feature(Target::SVE)) {
diff --git a/src/Target.cpp b/src/Target.cpp
index c0cd3e9bab3a..9270033f35e9 100644
--- a/src/Target.cpp
+++ b/src/Target.cpp
@@ -688,7 +688,16 @@ const std::map feature_name_map = {
     {"arm_fp16", Target::ARMFp16},
     {"llvm_large_code_model", Target::LLVMLargeCodeModel},
     {"rvv", Target::RVV},
+    {"armv8a", Target::ARMv8a},
     {"armv81a", Target::ARMv81a},
+    {"armv82a", Target::ARMv82a},
+    {"armv83a", Target::ARMv83a},
+    {"armv84a", Target::ARMv84a},
+    {"armv85a", Target::ARMv85a},
+    {"armv86a", Target::ARMv86a},
+    {"armv87a", Target::ARMv87a},
+    {"armv88a", Target::ARMv88a},
+    {"armv89a", Target::ARMv89a},
     {"sanitizer_coverage", Target::SanitizerCoverage},
     {"profile_by_timer", Target::ProfileByTimer},
     {"spirv", Target::SPIRV},
@@ -1258,6 +1267,40 @@ int Target::get_vulkan_capability_lower_bound() const {
     return 10;
 }
 
+int Target::get_arm_v8_lower_bound() const {
+    if (has_feature(Target::ARMv8a)) {
+        return 80;
+    }
+    if (has_feature(Target::ARMv81a)) {
+        return 81;
+    }
+    if (has_feature(Target::ARMv82a)) {
+        return 82;
+    }
+    if (has_feature(Target::ARMv83a)) {
+        return 83;
+    }
+    if (has_feature(Target::ARMv84a)) {
+        return 84;
+    }
+    if (has_feature(Target::ARMv85a)) {
+        return 85;
+    }
+    if (has_feature(Target::ARMv86a)) {
+        return 86;
+    }
+    if (has_feature(Target::ARMv87a)) {
+        return 87;
+    }
+    if (has_feature(Target::ARMv88a)) {
+        return 88;
+    }
+    if (has_feature(Target::ARMv89a)) {
+        return 89;
+    }
+    return -1;
+}
+
 bool Target::supports_type(const Type &t) const {
     if (t.bits() == 64) {
         if (t.is_float()) {
@@ -1461,7 +1504,7 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result)
     // (c) must match across both targets; it is an error if one target has the feature and the other doesn't
 
     // clang-format off
-    const std::array union_features = {{
+    const std::array union_features = {{
         // These are true union features.
         CUDA,
         D3D12Compute,
@@ -1482,20 +1525,32 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result)
         CUDACapability75,
         CUDACapability80,
         CUDACapability86,
+
         HVX_v62,
         HVX_v65,
         HVX_v66,
         HVX_v68,
+
         VulkanV10,
         VulkanV12,
         VulkanV13,
+
+        ARMv8a,
+        ARMv81a,
+        ARMv82a,
+        ARMv83a,
+        ARMv84a,
+        ARMv85a,
+        ARMv86a,
+        ARMv87a,
+        ARMv88a,
+        ARMv89a,
     }};
     // clang-format on
 
     // clang-format off
-    const std::array intersection_features = {{
+    const std::array intersection_features = {{
         ARMv7s,
-        ARMv81a,
         AVX,
         AVX2,
         AVX512,
@@ -1634,6 +1689,46 @@ bool Target::get_runtime_compatible_target(const Target &other, Target &result)
         output.features.reset(HVX_v68);
     }
 
+    // Pick tight lower bound for ARM capability. Use fall-through to clear redundant features
+    int arm_v8_a = get_arm_v8_lower_bound();
+    int arm_v8_b = other.get_arm_v8_lower_bound();
+
+    // A bound of -1 means that target set no ARM v8.x flag at all, so the
+    // result must not claim any either. Take a signed min so that -1 wins;
+    // an unsigned min would let the other target's version flags leak into
+    // the result.
+    int arm_v8_capability = std::min(arm_v8_a, arm_v8_b);
+    if (arm_v8_capability < 80) {
+        output.features.reset(ARMv8a);
+    }
+    if (arm_v8_capability < 81) {
+        output.features.reset(ARMv81a);
+    }
+    if (arm_v8_capability < 82) {
+        output.features.reset(ARMv82a);
+    }
+    if (arm_v8_capability < 83) {
+        output.features.reset(ARMv83a);
+    }
+    if (arm_v8_capability < 84) {
+        output.features.reset(ARMv84a);
+    }
+    if (arm_v8_capability < 85) {
+        output.features.reset(ARMv85a);
+    }
+    if (arm_v8_capability < 86) {
+        output.features.reset(ARMv86a);
+    }
+    if (arm_v8_capability < 87) {
+        output.features.reset(ARMv87a);
+    }
+    if (arm_v8_capability < 88) {
+        output.features.reset(ARMv88a);
+    }
+    if (arm_v8_capability < 89) {
+        output.features.reset(ARMv89a);
+    }
+
     result = output;
     return true;
 }
diff --git a/src/Target.h b/src/Target.h
index 7150513e6451..81da10a37126 100644
--- a/src/Target.h
+++ b/src/Target.h
@@ -154,7 +154,16 @@ struct Target {
         ARMFp16 = halide_target_feature_arm_fp16,
         LLVMLargeCodeModel = halide_llvm_large_code_model,
         RVV = halide_target_feature_rvv,
+        ARMv8a = halide_target_feature_armv8a,
         ARMv81a = halide_target_feature_armv81a,
+        ARMv82a = halide_target_feature_armv82a,
+        ARMv83a = halide_target_feature_armv83a,
+        ARMv84a = halide_target_feature_armv84a,
+        ARMv85a = halide_target_feature_armv85a,
+        ARMv86a = halide_target_feature_armv86a,
+        ARMv87a = halide_target_feature_armv87a,
+        ARMv88a = halide_target_feature_armv88a,
+        ARMv89a = halide_target_feature_armv89a,
         SanitizerCoverage = halide_target_feature_sanitizer_coverage,
         ProfileByTimer = halide_target_feature_profile_by_timer,
         SPIRV = halide_target_feature_spirv,
@@ -335,6 +344,10 @@ struct Target {
      * features are set. */
     int get_vulkan_capability_lower_bound() const;
 
+    /** Get the minimum ARM v8.x capability found as an integer. Returns
+     * -1 if no ARM v8.x features are set. */
+    int get_arm_v8_lower_bound() const;
+
     /** Was libHalide compiled with support for this target? */
     bool supported() const;
 
diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h
index 736d64478c8b..ce3fd013ec1b 100644
--- a/src/runtime/HalideRuntime.h
+++ b/src/runtime/HalideRuntime.h
@@ -1434,7 +1434,16 @@ typedef enum halide_target_feature_t {
     halide_target_feature_arm_fp16,               ///< Enable ARMv8.2-a half-precision floating point data processing
     halide_llvm_large_code_model,                 ///< Use the LLVM large code model to compile
     halide_target_feature_rvv,                    ///< Enable RISCV "V" Vector Extension
-    halide_target_feature_armv81a,                ///< Enable ARMv8.1-a instructions
+    halide_target_feature_armv8a,                 ///< Enable ARMv8a instructions
+    halide_target_feature_armv81a,                ///< Enable ARMv8.1a instructions
+    halide_target_feature_armv82a,                ///< Enable ARMv8.2a instructions
+    halide_target_feature_armv83a,                ///< Enable ARMv8.3a instructions
+    halide_target_feature_armv84a,                ///< Enable ARMv8.4a instructions
+    halide_target_feature_armv85a,                ///< Enable ARMv8.5a instructions
+    halide_target_feature_armv86a,                ///< Enable ARMv8.6a instructions
+    halide_target_feature_armv87a,                ///< Enable ARMv8.7a instructions
+    halide_target_feature_armv88a,                ///< Enable ARMv8.8a instructions
+    halide_target_feature_armv89a,                ///< Enable ARMv8.9a instructions
     halide_target_feature_sanitizer_coverage,     ///< Enable hooks for SanitizerCoverage support.
     halide_target_feature_profile_by_timer,       ///< Alternative to halide_target_feature_profile using timer interrupt for systems without threads or applicartions that need to avoid them.
     halide_target_feature_spirv,                  ///< Enable SPIR-V code generation support.
diff --git a/test/correctness/cross_compilation.cpp b/test/correctness/cross_compilation.cpp
index 7831e2c303f7..fc18ce51f81b 100644
--- a/test/correctness/cross_compilation.cpp
+++ b/test/correctness/cross_compilation.cpp
@@ -21,6 +21,16 @@ int main(int argc, char **argv) {
         "arm-64-android",
         "arm-64-android-hvx",
         "arm-64-ios",
+        "arm-64-ios-armv8a",
+        "arm-64-ios-armv81a",
+        "arm-64-ios-armv82a",
+        "arm-64-ios-armv83a",
+        "arm-64-ios-armv84a",
+        "arm-64-ios-armv85a",
+        "arm-64-ios-armv86a",
+        "arm-64-ios-armv87a",
+        "arm-64-ios-armv88a",
+        "arm-64-ios-armv89a",
         "arm-64-linux",
         "arm-64-noos-semihosting",
         "arm-64-windows",
diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h
index f386b7efc094..25b641800987 100644
--- a/test/correctness/simd_op_check.h
+++ b/test/correctness/simd_op_check.h
@@ -120,7 +120,16 @@ class SimdOpCheckTest {
                  Target::ARMDotProd,
                  Target::ARMFp16,
                  Target::ARMv7s,
+                 Target::ARMv8a,
                  Target::ARMv81a,
+                 Target::ARMv82a,
+                 Target::ARMv83a,
+                 Target::ARMv84a,
+                 Target::ARMv85a,
+                 Target::ARMv86a,
+                 Target::ARMv87a,
+                 Target::ARMv88a,
+                 Target::ARMv89a,
                  Target::AVX,
                  Target::AVX2,
                  Target::AVX512,
diff --git a/test/correctness/target.cpp b/test/correctness/target.cpp
index 7c8fcbe4d15f..acd468e9a8c7 100644
--- a/test/correctness/target.cpp
+++ b/test/correctness/target.cpp
@@ -205,6 +205,21 @@ int main(int argc, char **argv) {
         return 1;
     }
 
+    // ARM v8.x version flags combine as a lower bound: t1's lowest flag is
+    // armv8a and t2's is armv82a, so only armv8a may survive in the
+    // runtime-compatible target.
+    t1 = Target("arm-64-linux-armv87a-armv8a");
+    t2 = Target("arm-64-linux-armv82a-armv83a");
+    if (!t1.get_runtime_compatible_target(t2, t1)) {
+        printf("get_runtime_compatible_target failure\n");
+        return 1;
+    }
+    ts = t1.to_string();
+    if (ts != "arm-64-linux-armv8a") {
+        printf("get_runtime_compatible_target failure: %s\n", ts.c_str());
+        return 1;
+    }
+
     printf("Success!\n");
     return 0;
 }