Skip to content

Commit

Permalink
Common: Further optimize alltrue()/allfalse() on ARM
Browse the repository at this point in the history
  • Loading branch information
stenzek committed Jan 15, 2025
1 parent 0189e1e commit 57f3fee
Showing 1 changed file with 28 additions and 30 deletions.
58 changes: 28 additions & 30 deletions src/common/gsvector_neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <algorithm>
#include <cmath>
#include <cstdint>

#define GSVECTOR_HAS_FAST_INT_SHUFFLE8 1
#define GSVECTOR_HAS_SRLV 1
Expand Down Expand Up @@ -646,25 +647,10 @@ class alignas(16) GSVector2i

ALWAYS_INLINE bool alltrue() const
{
  // Lanes produced by NEON comparisons are all-ones when true, so instead of
  // testing the MSB of each 8-bit lane, the whole 64-bit register can be
  // compared against ~0 in one scalar op (works on both ARMv7 and ARM64).
  return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
}

// Comparison results are all-zero per lane when false, so "all false" is
// simply the entire 64-bit register being zero.
ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_s32(v2s), 0) == UINT64_C(0)); }

template<int i>
ALWAYS_INLINE GSVector2i insert8(int a) const
Expand Down Expand Up @@ -910,9 +896,12 @@ class alignas(16) GSVector2
return (vget_lane_u32(masks, 0) | (vget_lane_u32(masks, 1) << 1));
}

ALWAYS_INLINE bool alltrue() const
{
  // Float comparison results are all-ones bit patterns per 32-bit lane;
  // reinterpret as u64 and test both lanes with a single compare.
  return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
}

// Both 32-bit lanes are all-zero when false; test the combined 64 bits at once.
ALWAYS_INLINE bool allfalse() const { return (vget_lane_u64(vreinterpret_u64_f32(v2s), 0) == UINT64_C(0)); }

ALWAYS_INLINE GSVector2 replace_nan(const GSVector2& v) const { return v.blend32(*this, *this == *this); }

Expand Down Expand Up @@ -2110,23 +2099,20 @@ class alignas(16) GSVector4i

ALWAYS_INLINE bool alltrue() const
{
  // Lanes hold full-width comparison masks (all-ones when true).
#ifdef CPU_ARCH_ARM64
  // Horizontal minimum across the four 32-bit lanes equals ~0 only when
  // every lane is all-ones.
  return (vminvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0xFFFFFFFF));
#else
  // ARMv7 lacks horizontal reductions: AND the low/high halves together,
  // then the folded 64 bits must all be set.
  return (vget_lane_u64(vreinterpret_u64_s32(vand_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) ==
          UINT64_C(0xFFFFFFFFFFFFFFFF));
#endif
}

ALWAYS_INLINE bool allfalse() const
{
  // Lanes hold full-width comparison masks (all-zero when false).
#ifdef CPU_ARCH_ARM64
  // Horizontal maximum is zero only when every 32-bit lane is zero.
  return (vmaxvq_u32(vreinterpretq_u32_s32(v4s)) == UINT32_C(0));
#else
  // ARMv7: OR the low/high halves, then the folded 64 bits must all be clear.
  return (vget_lane_u64(vreinterpret_u64_s32(vorr_s32(vget_low_s32(v4s), vget_high_s32(v4s))), 0) == UINT64_C(0));
#endif
}

Expand Down Expand Up @@ -2727,13 +2713,25 @@ class alignas(16) GSVector4

ALWAYS_INLINE bool alltrue() const
{
  // Float comparison results are all-ones bit patterns per 32-bit lane.
#ifdef CPU_ARCH_ARM64
  // Horizontal minimum over the four lanes; all-ones iff every lane is set.
  return (vminvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0xFFFFFFFF));
#else
  // ARMv7: fold the halves with AND, then test the remaining 64 bits.
  return (vget_lane_u64(vreinterpret_u64_u32(vand_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
                                                      vget_high_u32(vreinterpretq_u32_f32(v4s)))),
                        0) == UINT64_C(0xFFFFFFFFFFFFFFFF));
#endif
}

ALWAYS_INLINE bool allfalse() const
{
  // Float comparison results are all-zero per lane when false.
#ifdef CPU_ARCH_ARM64
  // Horizontal maximum is zero only when every 32-bit lane is zero.
  return (vmaxvq_u32(vreinterpretq_u32_f32(v4s)) == UINT32_C(0));
#else
  // ARMv7: fold the halves with OR, then the remaining 64 bits must be clear.
  return (vget_lane_u64(vreinterpret_u64_u32(vorr_u32(vget_low_u32(vreinterpretq_u32_f32(v4s)),
                                                      vget_high_u32(vreinterpretq_u32_f32(v4s)))),
                        0) == UINT64_C(0));
#endif
}

ALWAYS_INLINE GSVector4 replace_nan(const GSVector4& v) const { return v.blend32(*this, *this == *this); }
Expand Down

0 comments on commit 57f3fee

Please sign in to comment.