From 57f3fee28cf45f7ee075940ba9e5ec80cefb7bd7 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Thu, 16 Jan 2025 00:43:33 +1000 Subject: [PATCH] Common: Further optimize alltrue()/allfalse() on ARM --- src/common/gsvector_neon.h | 58 ++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/src/common/gsvector_neon.h b/src/common/gsvector_neon.h index c0c4642222..1eaf2fba89 100644 --- a/src/common/gsvector_neon.h +++ b/src/common/gsvector_neon.h @@ -6,6 +6,7 @@ #include #include +#include #define GSVECTOR_HAS_FAST_INT_SHUFFLE8 1 #define GSVECTOR_HAS_SRLV 1 @@ -646,25 +647,10 @@ class alignas(16) GSVector2i ALWAYS_INLINE bool alltrue() const { - // MSB should be set in all 8-bit lanes. -#ifdef CPU_ARCH_ARM64 - return (vminv_u8(vreinterpret_u8_s32(v2s)) & 0x80) == 0x80; -#else - return ((vget_lane_u32(vreinterpret_u32_s32(v2s), 0) & vget_lane_u32(vreinterpret_u32_s32(v2s), 1) & 0x80808080u) == - 0x80808080u); -#endif + return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF)); } - ALWAYS_INLINE bool allfalse() const - { - // MSB should be clear in all 8-bit lanes. -#ifdef CPU_ARCH_ARM64 - return (vmaxv_u32(vreinterpret_u8_s32(v2s)) & 0x80) != 0x80; -#else - return ( - ((vget_lane_u32(vreinterpret_u32_s32(v2s), 0) | vget_lane_u32(vreinterpret_u32_s32(v2s), 1)) & 0x80808080u) == 0); -#endif - } + ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0)); } template ALWAYS_INLINE GSVector2i insert8(int a) const @@ -910,9 +896,12 @@ class alignas(16) GSVector2 return (vget_lane_u32(masks, 0) | (vget_lane_u32(masks, 1) << 1)); } - ALWAYS_INLINE bool alltrue() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == 0xFFFFFFFFFFFFFFFFULL); } + ALWAYS_INLINE bool alltrue() const + { + return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF)); + } - ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == 0); } + ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0)); } ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); } @@ -2110,23 +2099,20 @@ class alignas(16) GSVector4i ALWAYS_INLINE bool alltrue() const { - // MSB should be set in all 8-bit lanes. #ifdef CPU_ARCH_ARM64 - return (vminvq_u8(vreinterpretq_u8_s32(v4s)) & 0x80) == 0x80; + return (vminvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0xFFFFFFFF)); #else - const uint32x2_t res = vreinterpret_u32_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s))); - return ((vget_lane_u32(res, 0) & vget_lane_u32(res, 1) & 0x80808080u) == 0x80808080u); + return (vget_lane_u64(vreinterpret_u64_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) == + UINT64_C(0xFFFFFFFFFFFFFFFF)); #endif } ALWAYS_INLINE bool allfalse() const { - // MSB should be clear in all 8-bit lanes. #ifdef CPU_ARCH_ARM64 - return (vmaxvq_u32(vreinterpretq_u8_s32(v4s)) & 0x80) != 0x80; + return (vmaxvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0)); #else - const uint32x2_t res = vreinterpret_u32_s32(vorr_s32(vget_low_s32(v4s), vget_high_s32(v4s))); - return ((vget_lane_u32(res, 0) | vget_lane_u32(res, 1) & 0x80808080u) == 0); + return (vget_lane_u64(vreinterpret_u64_s32(vorr_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) == UINT64_C(0)); #endif } @@ -2727,13 +2713,25 @@ class alignas(16) GSVector4 ALWAYS_INLINE bool alltrue() const { - // return mask() == 0xf; - return ~(vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) & vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0; +#ifdef CPU_ARCH_ARM64 + return (vminvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0xFFFFFFFF)); +#else + + return (vget_lane_u64(vreinterpret_u64_u32(vand_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)), + vget_high_u32(vreinterpretq_u32_f32(v4s)))), + 0) == UINT64_C(0xFFFFFFFFFFFFFFFF)); +#endif } ALWAYS_INLINE bool allfalse() const { - return (vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 0) | vgetq_lane_u64(vreinterpretq_u64_f32(v4s), 1)) == 0; +#ifdef CPU_ARCH_ARM64 + return (vmaxvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0)); +#else + return (vget_lane_u64(vreinterpret_u64_u32(vorr_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)), + vget_high_u32(vreinterpretq_u32_f32(v4s)))), + 0) == UINT64_C(0)); +#endif } ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); }