From 56e79d457054ec142e3440cebd5099a91c4308cb Mon Sep 17 00:00:00 2001
From: Christopher Serr <christopher.serr@gmail.com>
Date: Sun, 14 Jul 2024 13:11:52 +0200
Subject: [PATCH] Support WebAssembly Relaxed SIMD

This makes use of the [WebAssembly Relaxed
SIMD](https://github.com/WebAssembly/relaxed-simd) instructions, which
trade off consistent results across architectures in certain edge cases
for better performance. These differing edge-case behaviors already
exist in the native equivalents that are used in the SIMD
implementation, so this does not regress correctness.
---
 .github/workflows/main.yml |  5 +++++
 Cargo.toml                 |  4 ++--
 src/lib.rs                 |  1 +
 src/wide/f32x4_t.rs        | 10 ++++++++++
 src/wide/f32x8_t.rs        | 16 +++++++++++-----
 src/wide/i32x4_t.rs        |  2 ++
 6 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 1646867..2f6d1f1 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -84,6 +84,11 @@ jobs:
         RUSTFLAGS: -Ctarget-feature=+simd128,+bulk-memory,+nontrapping-fptoint,+sign-ext
       run: cargo test --target wasm32-wasi
 
+    - name: Run tests with Relaxed SIMD
+      env:
+        RUSTFLAGS: -Ctarget-feature=+simd128,+relaxed-simd,+bulk-memory,+nontrapping-fptoint,+sign-ext
+      run: cargo test --target wasm32-wasi
+
   aarch64:
     runs-on: ubuntu-20.04
     steps:
diff --git a/Cargo.toml b/Cargo.toml
index 8e3f7c6..b645a4d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -31,8 +31,8 @@ default = ["std", "simd", "png-format"]
 std = ["tiny-skia-path/std"]
 no-std-float = ["tiny-skia-path/no-std-float"]
 
-# Enables SIMD instructions on x86 (from SSE up to AVX2), WebAssembly (SIMD128)
-# and AArch64 (Neon).
+# Enables SIMD instructions on x86 (from SSE up to AVX2), WebAssembly (SIMD128,
+# Relaxed SIMD) and AArch64 (Neon).
 # Has no effect on other targets. Present mainly for testing.
 simd = []
 
diff --git a/src/lib.rs b/src/lib.rs
index 718b4ed..0637a23 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -25,6 +25,7 @@ See the `examples/` directory for usage examples.
 #![allow(clippy::needless_range_loop)]
 #![allow(clippy::too_many_arguments)]
 #![allow(clippy::wrong_self_convention)]
+#![cfg_attr(all(feature = "simd", target_feature = "relaxed-simd"), feature(stdarch_wasm_relaxed_simd))]
 
 #[cfg(not(any(feature = "std", feature = "no-std-float")))]
 compile_error!("You have to activate either the `std` or the `no-std-float` feature.");
diff --git a/src/wide/f32x4_t.rs b/src/wide/f32x4_t.rs
index 21d5140..5985203 100644
--- a/src/wide/f32x4_t.rs
+++ b/src/wide/f32x4_t.rs
@@ -90,6 +90,8 @@ impl f32x4 {
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                 Self(unsafe { _mm_max_ps(self.0, rhs.0) })
+            } else if #[cfg(all(feature = "simd", target_feature = "relaxed-simd"))] {
+                Self(f32x4_relaxed_max(self.0, rhs.0))
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_pmax(self.0, rhs.0))
             } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
@@ -111,6 +113,8 @@ impl f32x4 {
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                 Self(unsafe { _mm_min_ps(self.0, rhs.0) })
+            }  else if #[cfg(all(feature = "simd", target_feature = "relaxed-simd"))] {
+                Self(f32x4_relaxed_min(self.0, rhs.0))
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(f32x4_pmin(self.0, rhs.0))
             } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
@@ -245,6 +249,8 @@ impl f32x4 {
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] {
                 Self(unsafe { _mm_blendv_ps(f.0, t.0, self.0) })
+            } else if #[cfg(all(feature = "simd", target_feature = "relaxed-simd"))] {
+                Self(i32x4_relaxed_laneselect(t.0, f.0, self.0))
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_bitselect(t.0, f.0, self.0))
             } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
@@ -302,6 +308,8 @@ impl f32x4 {
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                 i32x4(unsafe { _mm_cvtps_epi32(self.0) })
+            } else if #[cfg(all(feature = "simd", target_feature = "relaxed-simd"))] {
+                i32x4(i32x4_relaxed_trunc_f32x4(self.round().0))
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 i32x4(i32x4_trunc_sat_f32x4(self.round().0))
             } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
@@ -325,6 +333,8 @@ impl f32x4 {
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
                 i32x4(unsafe { _mm_cvttps_epi32(self.0) })
+            } else if #[cfg(all(feature = "simd", target_feature = "relaxed-simd"))] {
+                i32x4(i32x4_relaxed_trunc_f32x4(self.0))
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 i32x4(i32x4_trunc_sat_f32x4(self.0))
             } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
diff --git a/src/wide/f32x8_t.rs b/src/wide/f32x8_t.rs
index df79279..231af49 100644
--- a/src/wide/f32x8_t.rs
+++ b/src/wide/f32x8_t.rs
@@ -43,11 +43,17 @@ impl f32x8 {
     }
 
     pub fn floor(self) -> Self {
-        let roundtrip: f32x8 = cast(self.trunc_int().to_f32x8());
-        roundtrip
-            - roundtrip
-                .cmp_gt(self)
-                .blend(f32x8::splat(1.0), f32x8::default())
+        cfg_if::cfg_if! {
+            if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
+                Self(self.0.floor(), self.1.floor())
+            } else {
+                let roundtrip: f32x8 = cast(self.trunc_int().to_f32x8());
+                roundtrip
+                    - roundtrip
+                        .cmp_gt(self)
+                        .blend(f32x8::splat(1.0), f32x8::default())
+            }
+        }
     }
 
     pub fn fract(self) -> Self {
diff --git a/src/wide/i32x4_t.rs b/src/wide/i32x4_t.rs
index fb77a0f..810454d 100644
--- a/src/wide/i32x4_t.rs
+++ b/src/wide/i32x4_t.rs
@@ -56,6 +56,8 @@ impl i32x4 {
         cfg_if::cfg_if! {
             if #[cfg(all(feature = "simd", target_feature = "sse4.1"))] {
                 Self(unsafe { _mm_blendv_epi8(f.0, t.0, self.0) })
+            } else if #[cfg(all(feature = "simd", target_feature = "relaxed-simd"))] {
+                Self(i32x4_relaxed_laneselect(t.0, f.0, self.0))
             } else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
                 Self(v128_bitselect(t.0, f.0, self.0))
             } else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {