From 56e79d457054ec142e3440cebd5099a91c4308cb Mon Sep 17 00:00:00 2001 From: Christopher Serr Date: Sun, 14 Jul 2024 13:11:52 +0200 Subject: [PATCH] Support WebAssembly Relaxed SIMD This makes use of the [WebAssembly Relaxed SIMD](https://github.com/WebAssembly/relaxed-simd) instructions, which trade off consistent results across architectures for certain edge cases to gain better performance. These differing edge case behaviors already exist in the native equivalents that are used in the SIMD implementation, so this does not regress any correctness. --- .github/workflows/main.yml | 5 +++++ Cargo.toml | 4 ++-- src/lib.rs | 1 + src/wide/f32x4_t.rs | 10 ++++++++++ src/wide/f32x8_t.rs | 16 +++++++++++----- src/wide/i32x4_t.rs | 2 ++ 6 files changed, 31 insertions(+), 7 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1646867..2f6d1f1 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -84,6 +84,11 @@ jobs: RUSTFLAGS: -Ctarget-feature=+simd128,+bulk-memory,+nontrapping-fptoint,+sign-ext run: cargo test --target wasm32-wasi + - name: Run tests with Relaxed SIMD + env: + RUSTFLAGS: -Ctarget-feature=+simd128,+relaxed-simd,+bulk-memory,+nontrapping-fptoint,+sign-ext + run: cargo test --target wasm32-wasi + aarch64: runs-on: ubuntu-20.04 steps: diff --git a/Cargo.toml b/Cargo.toml index 8e3f7c6..b645a4d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,8 +31,8 @@ default = ["std", "simd", "png-format"] std = ["tiny-skia-path/std"] no-std-float = ["tiny-skia-path/no-std-float"] -# Enables SIMD instructions on x86 (from SSE up to AVX2), WebAssembly (SIMD128) -# and AArch64 (Neon). +# Enables SIMD instructions on x86 (from SSE up to AVX2), WebAssembly (SIMD128, +# Relaxed SIMD) and AArch64 (Neon). # Has no effect on other targets. Present mainly for testing. 
simd = [] diff --git a/src/lib.rs b/src/lib.rs index 718b4ed..0637a23 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -25,6 +25,7 @@ See the `examples/` directory for usage examples. #![allow(clippy::needless_range_loop)] #![allow(clippy::too_many_arguments)] #![allow(clippy::wrong_self_convention)] +#![feature(stdarch_wasm_relaxed_simd)] #[cfg(not(any(feature = "std", feature = "no-std-float")))] compile_error!("You have to activate either the `std` or the `no-std-float` feature."); diff --git a/src/wide/f32x4_t.rs b/src/wide/f32x4_t.rs index 21d5140..5985203 100644 --- a/src/wide/f32x4_t.rs +++ b/src/wide/f32x4_t.rs @@ -90,6 +90,8 @@ impl f32x4 { cfg_if::cfg_if! { if #[cfg(all(feature = "simd", target_feature = "sse2"))] { Self(unsafe { _mm_max_ps(self.0, rhs.0) }) + } else if #[cfg(all(feature = "simd", target_feature = "relaxed-simd"))] { + Self(f32x4_relaxed_max(self.0, rhs.0)) } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { Self(f32x4_pmax(self.0, rhs.0)) } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { @@ -111,6 +113,8 @@ impl f32x4 { cfg_if::cfg_if! { if #[cfg(all(feature = "simd", target_feature = "sse2"))] { Self(unsafe { _mm_min_ps(self.0, rhs.0) }) + } else if #[cfg(all(feature = "simd", target_feature = "relaxed-simd"))] { + Self(f32x4_relaxed_min(self.0, rhs.0)) } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { Self(f32x4_pmin(self.0, rhs.0)) } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { @@ -245,6 +249,8 @@ impl f32x4 { cfg_if::cfg_if! 
{ if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] { Self(unsafe { _mm_blendv_ps(f.0, t.0, self.0) }) + } else if #[cfg(all(feature = "simd", target_feature = "relaxed-simd"))] { + Self(i32x4_relaxed_laneselect(t.0, f.0, self.0)) } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { Self(v128_bitselect(t.0, f.0, self.0)) } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { @@ -302,6 +308,8 @@ impl f32x4 { cfg_if::cfg_if! { if #[cfg(all(feature = "simd", target_feature = "sse2"))] { i32x4(unsafe { _mm_cvtps_epi32(self.0) }) + } else if #[cfg(all(feature = "simd", target_feature = "relaxed-simd"))] { + i32x4(i32x4_relaxed_trunc_f32x4(self.round().0)) } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { i32x4(i32x4_trunc_sat_f32x4(self.round().0)) } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { @@ -325,6 +333,8 @@ impl f32x4 { cfg_if::cfg_if! { if #[cfg(all(feature = "simd", target_feature = "sse2"))] { i32x4(unsafe { _mm_cvttps_epi32(self.0) }) + } else if #[cfg(all(feature = "simd", target_feature = "relaxed-simd"))] { + i32x4(i32x4_relaxed_trunc_f32x4(self.0)) } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { i32x4(i32x4_trunc_sat_f32x4(self.0)) } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] { diff --git a/src/wide/f32x8_t.rs b/src/wide/f32x8_t.rs index df79279..231af49 100644 --- a/src/wide/f32x8_t.rs +++ b/src/wide/f32x8_t.rs @@ -43,11 +43,17 @@ impl f32x8 { } pub fn floor(self) -> Self { - let roundtrip: f32x8 = cast(self.trunc_int().to_f32x8()); - roundtrip - - roundtrip - .cmp_gt(self) - .blend(f32x8::splat(1.0), f32x8::default()) + cfg_if::cfg_if! 
{ + if #[cfg(all(feature = "simd", target_feature = "simd128"))] { + Self(self.0.floor(), self.1.floor()) + } else { + let roundtrip: f32x8 = cast(self.trunc_int().to_f32x8()); + roundtrip + - roundtrip + .cmp_gt(self) + .blend(f32x8::splat(1.0), f32x8::default()) + } + } } pub fn fract(self) -> Self { diff --git a/src/wide/i32x4_t.rs b/src/wide/i32x4_t.rs index fb77a0f..810454d 100644 --- a/src/wide/i32x4_t.rs +++ b/src/wide/i32x4_t.rs @@ -56,6 +56,8 @@ impl i32x4 { cfg_if::cfg_if! { if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] { Self(unsafe { _mm_blendv_epi8(f.0, t.0, self.0) }) + } else if #[cfg(all(feature = "simd", target_feature = "relaxed-simd"))] { + Self(i32x4_relaxed_laneselect(t.0, f.0, self.0)) } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] { Self(v128_bitselect(t.0, f.0, self.0)) } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {