From bd7c794f526c55b5b5d38143bb8d6d1d4b13b946 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Fri, 7 Jul 2023 04:07:00 -0700
Subject: [PATCH 01/74] Sync portable-simd to 2023 July 07

Sync up to rust-lang/portable-simd@7c7dbe0c505ccbc02ff30c1e37381ab1d47bf46f
---
 .github/workflows/ci.yml                   |  9 ++-
 crates/core_simd/src/cast.rs               | 74 +++++++++---------
 crates/core_simd/src/elements/const_ptr.rs | 30 ++++++-
 crates/core_simd/src/elements/float.rs     | 67 +++++++++++++++-
 crates/core_simd/src/elements/int.rs       | 19 ++++-
 crates/core_simd/src/elements/mut_ptr.rs   | 30 ++++++-
 crates/core_simd/src/elements/uint.rs      | 19 ++++-
 crates/core_simd/src/iter.rs               |  4 +
 crates/core_simd/src/lib.rs                |  2 +-
 crates/core_simd/src/masks.rs              |  4 +
 crates/core_simd/src/mod.rs                |  2 +
 crates/core_simd/src/ops.rs                |  8 +-
 crates/core_simd/src/ord.rs                |  4 +
 crates/core_simd/src/simd/prelude.rs       | 80 +++++++++++++++++++
 crates/core_simd/src/swizzle_dyn.rs        | 24 +++++-
 crates/core_simd/src/vector.rs             | 91 +++++-----------------
 crates/core_simd/tests/cast.rs             |  3 +-
 crates/core_simd/tests/round.rs            |  1 +
 18 files changed, 338 insertions(+), 133 deletions(-)
 create mode 100644 crates/core_simd/src/simd/prelude.rs

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index acd47a3da72b2..1ff377fce3467 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -38,8 +38,9 @@ jobs:
           - i586-unknown-linux-gnu
           - aarch64-unknown-linux-gnu
           - armv7-unknown-linux-gnueabihf
-          - mips-unknown-linux-gnu
-          - mips64-unknown-linux-gnuabi64
+          # non-nightly since https://github.com/rust-lang/rust/pull/113274
+          # - mips-unknown-linux-gnu
+          # - mips64-unknown-linux-gnuabi64
           - powerpc-unknown-linux-gnu
           - powerpc64-unknown-linux-gnu
           - riscv64gc-unknown-linux-gnu
@@ -191,8 +192,8 @@ jobs:
           # Note: The issue above means neither of these mips targets will use
           # MSA (mips simd) but MIPS uses a nonstandard binary representation
           # for NaNs which makes it worth testing on despite that.
-          - mips-unknown-linux-gnu
-          - mips64-unknown-linux-gnuabi64
+          # - mips-unknown-linux-gnu
+          # - mips64-unknown-linux-gnuabi64
           - riscv64gc-unknown-linux-gnu
           # TODO this test works, but it appears to time out
           # - powerpc-unknown-linux-gnu
diff --git a/crates/core_simd/src/cast.rs b/crates/core_simd/src/cast.rs
index 65a3f845ffca7..1c3592f807578 100644
--- a/crates/core_simd/src/cast.rs
+++ b/crates/core_simd/src/cast.rs
@@ -1,55 +1,51 @@
 use crate::simd::SimdElement;
 
+mod sealed {
+    /// Cast vector elements to other types.
+    ///
+    /// # Safety
+    /// Implementing this trait asserts that the type is a valid vector element for the `simd_cast`
+    /// or `simd_as` intrinsics.
+    pub unsafe trait Sealed {}
+}
+use sealed::Sealed;
+
 /// Supporting trait for `Simd::cast`.  Typically doesn't need to be used directly.
-///
-/// # Safety
-/// Implementing this trait asserts that the type is a valid vector element for the `simd_cast` or
-/// `simd_as` intrinsics.
-pub unsafe trait SimdCast: SimdElement {}
+pub trait SimdCast: Sealed + SimdElement {}
 
 // Safety: primitive number types can be cast to other primitive number types
-unsafe impl SimdCast for i8 {}
+unsafe impl Sealed for i8 {}
+impl SimdCast for i8 {}
 // Safety: primitive number types can be cast to other primitive number types
-unsafe impl SimdCast for i16 {}
+unsafe impl Sealed for i16 {}
+impl SimdCast for i16 {}
 // Safety: primitive number types can be cast to other primitive number types
-unsafe impl SimdCast for i32 {}
+unsafe impl Sealed for i32 {}
+impl SimdCast for i32 {}
 // Safety: primitive number types can be cast to other primitive number types
-unsafe impl SimdCast for i64 {}
+unsafe impl Sealed for i64 {}
+impl SimdCast for i64 {}
 // Safety: primitive number types can be cast to other primitive number types
-unsafe impl SimdCast for isize {}
+unsafe impl Sealed for isize {}
+impl SimdCast for isize {}
 // Safety: primitive number types can be cast to other primitive number types
-unsafe impl SimdCast for u8 {}
+unsafe impl Sealed for u8 {}
+impl SimdCast for u8 {}
 // Safety: primitive number types can be cast to other primitive number types
-unsafe impl SimdCast for u16 {}
+unsafe impl Sealed for u16 {}
+impl SimdCast for u16 {}
 // Safety: primitive number types can be cast to other primitive number types
-unsafe impl SimdCast for u32 {}
+unsafe impl Sealed for u32 {}
+impl SimdCast for u32 {}
 // Safety: primitive number types can be cast to other primitive number types
-unsafe impl SimdCast for u64 {}
+unsafe impl Sealed for u64 {}
+impl SimdCast for u64 {}
 // Safety: primitive number types can be cast to other primitive number types
-unsafe impl SimdCast for usize {}
+unsafe impl Sealed for usize {}
+impl SimdCast for usize {}
 // Safety: primitive number types can be cast to other primitive number types
-unsafe impl SimdCast for f32 {}
+unsafe impl Sealed for f32 {}
+impl SimdCast for f32 {}
 // Safety: primitive number types can be cast to other primitive number types
-unsafe impl SimdCast for f64 {}
-
-/// Supporting trait for `Simd::cast_ptr`.  Typically doesn't need to be used directly.
-///
-/// # Safety
-/// Implementing this trait asserts that the type is a valid vector element for the `simd_cast_ptr`
-/// intrinsic.
-pub unsafe trait SimdCastPtr<T> {}
-
-// Safety: pointers can be cast to other pointer types
-unsafe impl<T, U> SimdCastPtr<T> for *const U
-where
-    U: core::ptr::Pointee,
-    T: core::ptr::Pointee<Metadata = U::Metadata>,
-{
-}
-// Safety: pointers can be cast to other pointer types
-unsafe impl<T, U> SimdCastPtr<T> for *mut U
-where
-    U: core::ptr::Pointee,
-    T: core::ptr::Pointee<Metadata = U::Metadata>,
-{
-}
+unsafe impl Sealed for f64 {}
+impl SimdCast for f64 {}
diff --git a/crates/core_simd/src/elements/const_ptr.rs b/crates/core_simd/src/elements/const_ptr.rs
index 0ef9802b5e219..f215f9a61d02e 100644
--- a/crates/core_simd/src/elements/const_ptr.rs
+++ b/crates/core_simd/src/elements/const_ptr.rs
@@ -1,5 +1,5 @@
 use super::sealed::Sealed;
-use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount};
+use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SimdUint, SupportedLaneCount};
 
 /// Operations on SIMD vectors of constant pointers.
 pub trait SimdConstPtr: Copy + Sealed {
@@ -9,6 +9,9 @@ pub trait SimdConstPtr: Copy + Sealed {
     /// Vector of `isize` with the same number of lanes.
     type Isize;
 
+    /// Vector of const pointers to another type, with the same number of lanes.
+    type CastPtr<T>;
+
     /// Vector of mutable pointers to the same type.
     type MutPtr;
 
@@ -18,6 +21,11 @@ pub trait SimdConstPtr: Copy + Sealed {
     /// Returns `true` for each lane that is null.
     fn is_null(self) -> Self::Mask;
 
+    /// Casts to a pointer of another type.
+    ///
+    /// Equivalent to calling [`pointer::cast`] on each lane.
+    fn cast<T>(self) -> Self::CastPtr<T>;
+
     /// Changes constness without changing the type.
     ///
     /// Equivalent to calling [`pointer::cast_mut`] on each lane.
@@ -78,6 +86,7 @@ where
 {
     type Usize = Simd<usize, LANES>;
     type Isize = Simd<isize, LANES>;
+    type CastPtr<U> = Simd<*const U, LANES>;
     type MutPtr = Simd<*mut T, LANES>;
     type Mask = Mask<isize, LANES>;
 
@@ -86,9 +95,22 @@ where
         Simd::splat(core::ptr::null()).simd_eq(self)
     }
 
+    #[inline]
+    fn cast<U>(self) -> Self::CastPtr<U> {
+        // SimdElement currently requires zero-sized metadata, so this should never fail.
+        // If this ever changes, `simd_cast_ptr` should produce a post-mono error.
+        use core::{mem::size_of, ptr::Pointee};
+        assert_eq!(size_of::<<T as Pointee>::Metadata>(), 0);
+        assert_eq!(size_of::<<U as Pointee>::Metadata>(), 0);
+
+        // Safety: pointers can be cast
+        unsafe { intrinsics::simd_cast_ptr(self) }
+    }
+
     #[inline]
     fn cast_mut(self) -> Self::MutPtr {
-        self.cast_ptr()
+        // Safety: pointers can be cast
+        unsafe { intrinsics::simd_cast_ptr(self) }
     }
 
     #[inline]
@@ -106,9 +128,9 @@ where
         // In the mean-time, this operation is defined to be "as if" it was
         // a wrapping_offset, so we can emulate it as such. This should properly
         // restore pointer provenance even under today's compiler.
-        self.cast_ptr::<*const u8>()
+        self.cast::<u8>()
             .wrapping_offset(addr.cast::<isize>() - self.addr().cast::<isize>())
-            .cast_ptr()
+            .cast()
     }
 
     #[inline]
diff --git a/crates/core_simd/src/elements/float.rs b/crates/core_simd/src/elements/float.rs
index d602232705560..501c1c5ddd3f2 100644
--- a/crates/core_simd/src/elements/float.rs
+++ b/crates/core_simd/src/elements/float.rs
@@ -1,6 +1,6 @@
 use super::sealed::Sealed;
 use crate::simd::{
-    intrinsics, LaneCount, Mask, Simd, SimdElement, SimdPartialEq, SimdPartialOrd,
+    intrinsics, LaneCount, Mask, Simd, SimdCast, SimdElement, SimdPartialEq, SimdPartialOrd,
     SupportedLaneCount,
 };
 
@@ -15,6 +15,53 @@ pub trait SimdFloat: Copy + Sealed {
     /// Bit representation of this SIMD vector type.
     type Bits;
 
+    /// A SIMD vector with a different element type.
+    type Cast<T: SimdElement>;
+
+    /// Performs elementwise conversion of this vector's elements to another SIMD-valid type.
+    ///
+    /// This follows the semantics of Rust's `as` conversion for floats (truncating or saturating
+    /// at the limits) for each element.
+    ///
+    /// # Example
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::{SimdFloat, SimdInt, Simd};
+    /// let floats: Simd<f32, 4> = Simd::from_array([1.9, -4.5, f32::INFINITY, f32::NAN]);
+    /// let ints = floats.cast::<i32>();
+    /// assert_eq!(ints, Simd::from_array([1, -4, i32::MAX, 0]));
+    ///
+    /// // Formally equivalent, but `Simd::cast` can optimize better.
+    /// assert_eq!(ints, Simd::from_array(floats.to_array().map(|x| x as i32)));
+    ///
+    /// // The float conversion does not round-trip.
+    /// let floats_again = ints.cast();
+    /// assert_ne!(floats, floats_again);
+    /// assert_eq!(floats_again, Simd::from_array([1.0, -4.0, 2147483647.0, 0.0]));
+    /// ```
+    #[must_use]
+    fn cast<T: SimdCast>(self) -> Self::Cast<T>;
+
+    /// Rounds toward zero and converts to the same-width integer type, assuming that
+    /// the value is finite and fits in that type.
+    ///
+    /// # Safety
+    /// The value must:
+    ///
+    /// * Not be NaN
+    /// * Not be infinite
+    /// * Be representable in the return type, after truncating off its fractional part
+    ///
+    /// If these requirements are infeasible or costly, consider using the safe function [cast],
+    /// which saturates on conversion.
+    ///
+    /// [cast]: Simd::cast
+    unsafe fn to_int_unchecked<I: SimdCast>(self) -> Self::Cast<I>
+    where
+        Self::Scalar: core::convert::FloatToInt<I>;
+
     /// Raw transmutation to an unsigned integer vector type with the
     /// same size and number of lanes.
     #[must_use = "method returns a new vector and does not mutate the original value"]
@@ -206,6 +253,24 @@ macro_rules! impl_trait {
             type Mask = Mask<<$mask_ty as SimdElement>::Mask, LANES>;
             type Scalar = $ty;
             type Bits = Simd<$bits_ty, LANES>;
+            type Cast<T: SimdElement> = Simd<T, LANES>;
+
+            #[inline]
+            fn cast<T: SimdCast>(self) -> Self::Cast<T>
+            {
+                // Safety: supported types are guaranteed by SimdCast
+                unsafe { intrinsics::simd_as(self) }
+            }
+
+            #[inline]
+            #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
+            unsafe fn to_int_unchecked<I: SimdCast>(self) -> Self::Cast<I>
+            where
+                Self::Scalar: core::convert::FloatToInt<I>,
+            {
+                // Safety: supported types are guaranteed by SimdCast, the caller is responsible for the extra invariants
+                unsafe { intrinsics::simd_cast(self) }
+            }
 
             #[inline]
             fn to_bits(self) -> Simd<$bits_ty, LANES> {
diff --git a/crates/core_simd/src/elements/int.rs b/crates/core_simd/src/elements/int.rs
index 9b8c37ed466ec..6db89ff9a659e 100644
--- a/crates/core_simd/src/elements/int.rs
+++ b/crates/core_simd/src/elements/int.rs
@@ -1,6 +1,6 @@
 use super::sealed::Sealed;
 use crate::simd::{
-    intrinsics, LaneCount, Mask, Simd, SimdElement, SimdPartialOrd, SupportedLaneCount,
+    intrinsics, LaneCount, Mask, Simd, SimdCast, SimdElement, SimdPartialOrd, SupportedLaneCount,
 };
 
 /// Operations on SIMD vectors of signed integers.
@@ -11,6 +11,16 @@ pub trait SimdInt: Copy + Sealed {
     /// Scalar type contained by this SIMD vector type.
     type Scalar;
 
+    /// A SIMD vector with a different element type.
+    type Cast<T: SimdElement>;
+
+    /// Performs elementwise conversion of this vector's elements to another SIMD-valid type.
+    ///
+    /// This follows the semantics of Rust's `as` conversion for casting integers (wrapping to
+    /// other integer types, and saturating to float types).
+    #[must_use]
+    fn cast<T: SimdCast>(self) -> Self::Cast<T>;
+
     /// Lanewise saturating add.
     ///
     /// # Examples
@@ -198,6 +208,13 @@ macro_rules! impl_trait {
         {
             type Mask = Mask<<$ty as SimdElement>::Mask, LANES>;
             type Scalar = $ty;
+            type Cast<T: SimdElement> = Simd<T, LANES>;
+
+            #[inline]
+            fn cast<T: SimdCast>(self) -> Self::Cast<T> {
+                // Safety: supported types are guaranteed by SimdCast
+                unsafe { intrinsics::simd_as(self) }
+            }
 
             #[inline]
             fn saturating_add(self, second: Self) -> Self {
diff --git a/crates/core_simd/src/elements/mut_ptr.rs b/crates/core_simd/src/elements/mut_ptr.rs
index d87986b4a091c..4bdc6a14ce4a6 100644
--- a/crates/core_simd/src/elements/mut_ptr.rs
+++ b/crates/core_simd/src/elements/mut_ptr.rs
@@ -1,5 +1,5 @@
 use super::sealed::Sealed;
-use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SupportedLaneCount};
+use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SimdUint, SupportedLaneCount};
 
 /// Operations on SIMD vectors of mutable pointers.
 pub trait SimdMutPtr: Copy + Sealed {
@@ -9,6 +9,9 @@ pub trait SimdMutPtr: Copy + Sealed {
     /// Vector of `isize` with the same number of lanes.
     type Isize;
 
+    /// Vector of mutable pointers to another type, with the same number of lanes.
+    type CastPtr<T>;
+
     /// Vector of constant pointers to the same type.
     type ConstPtr;
 
@@ -18,6 +21,11 @@ pub trait SimdMutPtr: Copy + Sealed {
     /// Returns `true` for each lane that is null.
     fn is_null(self) -> Self::Mask;
 
+    /// Casts to a pointer of another type.
+    ///
+    /// Equivalent to calling [`pointer::cast`] on each lane.
+    fn cast<T>(self) -> Self::CastPtr<T>;
+
     /// Changes constness without changing the type.
     ///
     /// Equivalent to calling [`pointer::cast_const`] on each lane.
@@ -73,6 +81,7 @@ where
 {
     type Usize = Simd<usize, LANES>;
     type Isize = Simd<isize, LANES>;
+    type CastPtr<U> = Simd<*mut U, LANES>;
     type ConstPtr = Simd<*const T, LANES>;
     type Mask = Mask<isize, LANES>;
 
@@ -81,9 +90,22 @@ where
         Simd::splat(core::ptr::null_mut()).simd_eq(self)
     }
 
+    #[inline]
+    fn cast<U>(self) -> Self::CastPtr<U> {
+        // SimdElement currently requires zero-sized metadata, so this should never fail.
+        // If this ever changes, `simd_cast_ptr` should produce a post-mono error.
+        use core::{mem::size_of, ptr::Pointee};
+        assert_eq!(size_of::<<T as Pointee>::Metadata>(), 0);
+        assert_eq!(size_of::<<U as Pointee>::Metadata>(), 0);
+
+        // Safety: pointers can be cast
+        unsafe { intrinsics::simd_cast_ptr(self) }
+    }
+
     #[inline]
     fn cast_const(self) -> Self::ConstPtr {
-        self.cast_ptr()
+        // Safety: pointers can be cast
+        unsafe { intrinsics::simd_cast_ptr(self) }
     }
 
     #[inline]
@@ -101,9 +123,9 @@ where
         // In the mean-time, this operation is defined to be "as if" it was
         // a wrapping_offset, so we can emulate it as such. This should properly
         // restore pointer provenance even under today's compiler.
-        self.cast_ptr::<*mut u8>()
+        self.cast::<u8>()
             .wrapping_offset(addr.cast::<isize>() - self.addr().cast::<isize>())
-            .cast_ptr()
+            .cast()
     }
 
     #[inline]
diff --git a/crates/core_simd/src/elements/uint.rs b/crates/core_simd/src/elements/uint.rs
index 21e7e76eb3de5..3926c395ec9ac 100644
--- a/crates/core_simd/src/elements/uint.rs
+++ b/crates/core_simd/src/elements/uint.rs
@@ -1,11 +1,21 @@
 use super::sealed::Sealed;
-use crate::simd::{intrinsics, LaneCount, Simd, SupportedLaneCount};
+use crate::simd::{intrinsics, LaneCount, Simd, SimdCast, SimdElement, SupportedLaneCount};
 
 /// Operations on SIMD vectors of unsigned integers.
 pub trait SimdUint: Copy + Sealed {
     /// Scalar type contained by this SIMD vector type.
     type Scalar;
 
+    /// A SIMD vector with a different element type.
+    type Cast<T: SimdElement>;
+
+    /// Performs elementwise conversion of this vector's elements to another SIMD-valid type.
+    ///
+    /// This follows the semantics of Rust's `as` conversion for casting integers (wrapping to
+    /// other integer types, and saturating to float types).
+    #[must_use]
+    fn cast<T: SimdCast>(self) -> Self::Cast<T>;
+
     /// Lanewise saturating add.
     ///
     /// # Examples
@@ -77,6 +87,13 @@ macro_rules! impl_trait {
             LaneCount<LANES>: SupportedLaneCount,
         {
             type Scalar = $ty;
+            type Cast<T: SimdElement> = Simd<T, LANES>;
+
+            #[inline]
+            fn cast<T: SimdCast>(self) -> Self::Cast<T> {
+                // Safety: supported types are guaranteed by SimdCast
+                unsafe { intrinsics::simd_as(self) }
+            }
 
             #[inline]
             fn saturating_add(self, second: Self) -> Self {
diff --git a/crates/core_simd/src/iter.rs b/crates/core_simd/src/iter.rs
index 3275b4db8e49f..328c995b81ddd 100644
--- a/crates/core_simd/src/iter.rs
+++ b/crates/core_simd/src/iter.rs
@@ -10,6 +10,7 @@ macro_rules! impl_traits {
         where
             LaneCount<LANES>: SupportedLaneCount,
         {
+            #[inline]
             fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
                 iter.fold(Simd::splat(0 as $type), Add::add)
             }
@@ -19,6 +20,7 @@ macro_rules! impl_traits {
         where
             LaneCount<LANES>: SupportedLaneCount,
         {
+            #[inline]
             fn product<I: Iterator<Item = Self>>(iter: I) -> Self {
                 iter.fold(Simd::splat(1 as $type), Mul::mul)
             }
@@ -28,6 +30,7 @@ macro_rules! impl_traits {
         where
             LaneCount<LANES>: SupportedLaneCount,
         {
+            #[inline]
             fn sum<I: Iterator<Item = &'a Self>>(iter: I) -> Self {
                 iter.fold(Simd::splat(0 as $type), Add::add)
             }
@@ -37,6 +40,7 @@ macro_rules! impl_traits {
         where
             LaneCount<LANES>: SupportedLaneCount,
         {
+            #[inline]
             fn product<I: Iterator<Item = &'a Self>>(iter: I) -> Self {
                 iter.fold(Simd::splat(1 as $type), Mul::mul)
             }
diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index e5307de215520..fde406bda7060 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -16,7 +16,7 @@
 )]
 #![cfg_attr(feature = "generic_const_exprs", feature(generic_const_exprs))]
 #![cfg_attr(feature = "generic_const_exprs", allow(incomplete_features))]
-#![warn(missing_docs)]
+#![warn(missing_docs, clippy::missing_inline_in_public_items)] // basically all items, really
 #![deny(unsafe_op_in_unsafe_fn, clippy::undocumented_unsafe_blocks)]
 #![unstable(feature = "portable_simd", issue = "86656")]
 //! Portable SIMD module.
diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index e0f3c7beef689..fea687bdc1aef 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -179,6 +179,7 @@ where
     /// Panics if any lane is not 0 or -1.
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
+    #[track_caller]
     pub fn from_int(value: Simd<T, LANES>) -> Self {
         assert!(T::valid(value), "all values must be either 0 or -1",);
         // Safety: the validity has been checked
@@ -217,6 +218,7 @@ where
     /// Panics if `lane` is greater than or equal to the number of lanes in the vector.
     #[inline]
     #[must_use = "method returns a new bool and does not mutate the original value"]
+    #[track_caller]
     pub fn test(&self, lane: usize) -> bool {
         assert!(lane < LANES, "lane index out of range");
         // Safety: the lane index has been checked
@@ -240,6 +242,7 @@ where
     /// # Panics
     /// Panics if `lane` is greater than or equal to the number of lanes in the vector.
     #[inline]
+    #[track_caller]
     pub fn set(&mut self, lane: usize, value: bool) {
         assert!(lane < LANES, "lane index out of range");
         // Safety: the lane index has been checked
@@ -327,6 +330,7 @@ where
     T: MaskElement + fmt::Debug,
     LaneCount<LANES>: SupportedLaneCount,
 {
+    #[inline]
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.debug_list()
             .entries((0..LANES).map(|lane| self.test(lane)))
diff --git a/crates/core_simd/src/mod.rs b/crates/core_simd/src/mod.rs
index 35c659b7a429a..f9891a3b7c1d7 100644
--- a/crates/core_simd/src/mod.rs
+++ b/crates/core_simd/src/mod.rs
@@ -23,6 +23,8 @@ mod vendor;
 
 #[doc = include_str!("core_simd_docs.md")]
 pub mod simd {
+    pub mod prelude;
+
     pub(crate) use crate::core_simd::intrinsics;
 
     pub use crate::core_simd::alias::*;
diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index fc1e0bc426df3..b007456cf2cc5 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -15,6 +15,7 @@ where
     I: core::slice::SliceIndex<[T]>,
 {
     type Output = I::Output;
+    #[inline]
     fn index(&self, index: I) -> &Self::Output {
         &self.as_array()[index]
     }
@@ -26,6 +27,7 @@ where
     LaneCount<LANES>: SupportedLaneCount,
     I: core::slice::SliceIndex<[T]>,
 {
+    #[inline]
     fn index_mut(&mut self, index: I) -> &mut Self::Output {
         &mut self.as_mut_array()[index]
     }
@@ -118,10 +120,14 @@ macro_rules! for_base_types {
 
                     #[inline]
                     #[must_use = "operator returns a new vector without mutating the inputs"]
+                    // TODO: only useful for int Div::div, but we hope that this
+                    // will essentially always get inlined anyway.
+                    #[track_caller]
                     fn $call(self, rhs: Self) -> Self::Output {
                         $macro_impl!(self, rhs, $inner, $scalar)
                     }
-                })*
+                }
+            )*
     }
 }
 
diff --git a/crates/core_simd/src/ord.rs b/crates/core_simd/src/ord.rs
index 1ae9cd061fb2d..b2455190e8231 100644
--- a/crates/core_simd/src/ord.rs
+++ b/crates/core_simd/src/ord.rs
@@ -94,6 +94,7 @@ macro_rules! impl_integer {
             }
 
             #[inline]
+            #[track_caller]
             fn simd_clamp(self, min: Self, max: Self) -> Self {
                 assert!(
                     min.simd_le(max).all(),
@@ -200,6 +201,7 @@ macro_rules! impl_mask {
             }
 
             #[inline]
+            #[track_caller]
             fn simd_clamp(self, min: Self, max: Self) -> Self {
                 assert!(
                     min.simd_le(max).all(),
@@ -254,6 +256,7 @@ where
     }
 
     #[inline]
+    #[track_caller]
     fn simd_clamp(self, min: Self, max: Self) -> Self {
         assert!(
             min.simd_le(max).all(),
@@ -303,6 +306,7 @@ where
     }
 
     #[inline]
+    #[track_caller]
     fn simd_clamp(self, min: Self, max: Self) -> Self {
         assert!(
             min.simd_le(max).all(),
diff --git a/crates/core_simd/src/simd/prelude.rs b/crates/core_simd/src/simd/prelude.rs
new file mode 100644
index 0000000000000..e8fdc932d490f
--- /dev/null
+++ b/crates/core_simd/src/simd/prelude.rs
@@ -0,0 +1,80 @@
+//! The portable SIMD prelude.
+//!
+//! Includes important traits and types to be imported with a glob:
+//! ```ignore
+//! use std::simd::prelude::*;
+//! ```
+
+#[doc(no_inline)]
+pub use super::{
+    simd_swizzle, Mask, Simd, SimdConstPtr, SimdFloat, SimdInt, SimdMutPtr, SimdOrd, SimdPartialEq,
+    SimdPartialOrd, SimdUint,
+};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{f32x1, f32x2, f32x4, f32x8, f32x16, f32x32, f32x64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{f64x1, f64x2, f64x4, f64x8, f64x16, f64x32, f64x64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{i8x1, i8x2, i8x4, i8x8, i8x16, i8x32, i8x64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{i16x1, i16x2, i16x4, i16x8, i16x16, i16x32, i16x64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{i32x1, i32x2, i32x4, i32x8, i32x16, i32x32, i32x64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{i64x1, i64x2, i64x4, i64x8, i64x16, i64x32, i64x64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{isizex1, isizex2, isizex4, isizex8, isizex16, isizex32, isizex64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{u8x1, u8x2, u8x4, u8x8, u8x16, u8x32, u8x64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{u16x1, u16x2, u16x4, u16x8, u16x16, u16x32, u16x64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{u32x1, u32x2, u32x4, u32x8, u32x16, u32x32, u32x64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{u64x1, u64x2, u64x4, u64x8, u64x16, u64x32, u64x64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{usizex1, usizex2, usizex4, usizex8, usizex16, usizex32, usizex64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{mask8x1, mask8x2, mask8x4, mask8x8, mask8x16, mask8x32, mask8x64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{mask16x1, mask16x2, mask16x4, mask16x8, mask16x16, mask16x32, mask16x64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{mask32x1, mask32x2, mask32x4, mask32x8, mask32x16, mask32x32, mask32x64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{mask64x1, mask64x2, mask64x4, mask64x8, mask64x16, mask64x32, mask64x64};
+
+#[rustfmt::skip]
+#[doc(no_inline)]
+pub use super::{masksizex1, masksizex2, masksizex4, masksizex8, masksizex16, masksizex32, masksizex64};
diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs
index 6065d6459378e..ce621792534e1 100644
--- a/crates/core_simd/src/swizzle_dyn.rs
+++ b/crates/core_simd/src/swizzle_dyn.rs
@@ -16,9 +16,14 @@ where
     #[inline]
     pub fn swizzle_dyn(self, idxs: Simd<u8, N>) -> Self {
         #![allow(unused_imports, unused_unsafe)]
-        #[cfg(target_arch = "aarch64")]
+        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
         use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8};
-        #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))]
+        #[cfg(all(
+            target_arch = "arm",
+            target_feature = "v7",
+            target_feature = "neon",
+            target_endian = "little"
+        ))]
         use core::arch::arm::{uint8x8_t, vtbl1_u8};
         #[cfg(target_arch = "wasm32")]
         use core::arch::wasm32 as wasm;
@@ -29,13 +34,24 @@ where
         // SAFETY: Intrinsics covered by cfg
         unsafe {
             match N {
-                #[cfg(target_feature = "neon")]
+                #[cfg(all(
+                    any(
+                        target_arch = "aarch64",
+                        all(target_arch = "arm", target_feature = "v7")
+                    ),
+                    target_feature = "neon",
+                    target_endian = "little"
+                ))]
                 8 => transize(vtbl1_u8, self, idxs),
                 #[cfg(target_feature = "ssse3")]
                 16 => transize(x86::_mm_shuffle_epi8, self, idxs),
                 #[cfg(target_feature = "simd128")]
                 16 => transize(wasm::i8x16_swizzle, self, idxs),
-                #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+                #[cfg(all(
+                    target_arch = "aarch64",
+                    target_feature = "neon",
+                    target_endian = "little"
+                ))]
                 16 => transize(vqtbl1q_u8, self, idxs),
                 #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                 32 => transize_raw(avx2_pshufb, self, idxs),
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 3809cc961515b..9aa7bacfce981 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -1,6 +1,6 @@
 use crate::simd::{
-    intrinsics, LaneCount, Mask, MaskElement, SimdCast, SimdCastPtr, SimdConstPtr, SimdMutPtr,
-    SimdPartialOrd, SupportedLaneCount, Swizzle,
+    intrinsics, LaneCount, Mask, MaskElement, SimdConstPtr, SimdMutPtr, SimdPartialOrd,
+    SupportedLaneCount, Swizzle,
 };
 use core::convert::{TryFrom, TryInto};
 
@@ -122,6 +122,7 @@ where
     /// let v = u32x4::splat(0);
     /// assert_eq!(v.lanes(), 4);
     /// ```
+    #[inline]
     pub const fn lanes(&self) -> usize {
         Self::LANES
     }
@@ -136,6 +137,7 @@ where
     /// let v = u32x4::splat(8);
     /// assert_eq!(v.as_array(), &[8, 8, 8, 8]);
     /// ```
+    #[inline]
     pub fn splat(value: T) -> Self {
         // This is preferred over `[value; N]`, since it's explicitly a splat:
         // https://github.com/rust-lang/rust/issues/97804
@@ -156,6 +158,7 @@ where
     /// let v: u64x4 = Simd::from_array([0, 1, 2, 3]);
     /// assert_eq!(v.as_array(), &[0, 1, 2, 3]);
     /// ```
+    #[inline]
     pub const fn as_array(&self) -> &[T; N] {
         // SAFETY: `Simd<T, N>` is just an overaligned `[T; N]` with
         // potential padding at the end, so pointer casting to a
@@ -167,6 +170,7 @@ where
     }
 
     /// Returns a mutable array reference containing the entire SIMD vector.
+    #[inline]
     pub fn as_mut_array(&mut self) -> &mut [T; N] {
         // SAFETY: `Simd<T, N>` is just an overaligned `[T; N]` with
         // potential padding at the end, so pointer casting to a
@@ -184,6 +188,7 @@ where
     ///
     /// # Safety
     /// Reading `ptr` must be safe, as if by `<*const [T; N]>::read_unaligned`.
+    #[inline]
     const unsafe fn load(ptr: *const [T; N]) -> Self {
         // There are potentially simpler ways to write this function, but this should result in
         // LLVM `load <N x T>`
@@ -204,6 +209,7 @@ where
     ///
     /// # Safety
     /// Writing to `ptr` must be safe, as if by `<*mut [T; N]>::write_unaligned`.
+    #[inline]
     const unsafe fn store(self, ptr: *mut [T; N]) {
         // There are potentially simpler ways to write this function, but this should result in
         // LLVM `store <N x T>`
@@ -216,6 +222,7 @@ where
     }
 
     /// Converts an array to a SIMD vector.
+    #[inline]
     pub const fn from_array(array: [T; N]) -> Self {
         // SAFETY: `&array` is safe to read.
         //
@@ -228,6 +235,7 @@ where
     }
 
     /// Converts a SIMD vector to an array.
+    #[inline]
     pub const fn to_array(self) -> [T; N] {
         let mut tmp = core::mem::MaybeUninit::uninit();
         // SAFETY: writing to `tmp` is safe and initializes it.
@@ -259,6 +267,8 @@ where
     /// assert_eq!(v.as_array(), &[1, 2, 3, 4]);
     /// ```
     #[must_use]
+    #[inline]
+    #[track_caller]
     pub const fn from_slice(slice: &[T]) -> Self {
         assert!(
             slice.len() >= Self::LANES,
@@ -287,6 +297,8 @@ where
     /// v.copy_to_slice(&mut dest);
     /// assert_eq!(&dest, &[1, 2, 3, 4, 0, 0]);
     /// ```
+    #[inline]
+    #[track_caller]
     pub fn copy_to_slice(self, slice: &mut [T]) {
         assert!(
             slice.len() >= Self::LANES,
@@ -297,76 +309,6 @@ where
         unsafe { self.store(slice.as_mut_ptr().cast()) }
     }
 
-    /// Performs elementwise conversion of a SIMD vector's elements to another SIMD-valid type.
-    ///
-    /// This follows the semantics of Rust's `as` conversion for casting integers between
-    /// signed and unsigned (interpreting integers as 2s complement, so `-1` to `U::MAX` and
-    /// `1 << (U::BITS -1)` becoming `I::MIN` ), and from floats to integers (truncating,
-    /// or saturating at the limits) for each element.
-    ///
-    /// # Examples
-    /// ```
-    /// # #![feature(portable_simd)]
-    /// # use core::simd::Simd;
-    /// let floats: Simd<f32, 4> = Simd::from_array([1.9, -4.5, f32::INFINITY, f32::NAN]);
-    /// let ints = floats.cast::<i32>();
-    /// assert_eq!(ints, Simd::from_array([1, -4, i32::MAX, 0]));
-    ///
-    /// // Formally equivalent, but `Simd::cast` can optimize better.
-    /// assert_eq!(ints, Simd::from_array(floats.to_array().map(|x| x as i32)));
-    ///
-    /// // The float conversion does not round-trip.
-    /// let floats_again = ints.cast();
-    /// assert_ne!(floats, floats_again);
-    /// assert_eq!(floats_again, Simd::from_array([1.0, -4.0, 2147483647.0, 0.0]));
-    /// ```
-    #[must_use]
-    #[inline]
-    pub fn cast<U: SimdCast>(self) -> Simd<U, N>
-    where
-        T: SimdCast,
-    {
-        // Safety: supported types are guaranteed by SimdCast
-        unsafe { intrinsics::simd_as(self) }
-    }
-
-    /// Casts a vector of pointers to another pointer type.
-    #[must_use]
-    #[inline]
-    pub fn cast_ptr<U>(self) -> Simd<U, N>
-    where
-        T: SimdCastPtr<U>,
-        U: SimdElement,
-    {
-        // Safety: supported types are guaranteed by SimdCastPtr
-        unsafe { intrinsics::simd_cast_ptr(self) }
-    }
-
-    /// Rounds toward zero and converts to the same-width integer type, assuming that
-    /// the value is finite and fits in that type.
-    ///
-    /// # Safety
-    /// The value must:
-    ///
-    /// * Not be NaN
-    /// * Not be infinite
-    /// * Be representable in the return type, after truncating off its fractional part
-    ///
-    /// If these requirements are infeasible or costly, consider using the safe function [cast],
-    /// which saturates on conversion.
-    ///
-    /// [cast]: Simd::cast
-    #[inline]
-    #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
-    pub unsafe fn to_int_unchecked<I>(self) -> Simd<I, N>
-    where
-        T: core::convert::FloatToInt<I> + SimdCast,
-        I: SimdCast,
-    {
-        // Safety: supported types are guaranteed by SimdCast, the caller is responsible for the extra invariants
-        unsafe { intrinsics::simd_cast(self) }
-    }
-
     /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector.
     /// If an index is out-of-bounds, the element is instead selected from the `or` vector.
     ///
@@ -717,6 +659,7 @@ where
     LaneCount<N>: SupportedLaneCount,
     T: SimdElement,
 {
+    #[inline]
     fn clone(&self) -> Self {
         *self
     }
@@ -861,6 +804,7 @@ where
     LaneCount<N>: SupportedLaneCount,
     T: SimdElement,
 {
+    #[inline]
     fn from(array: [T; N]) -> Self {
         Self::from_array(array)
     }
@@ -871,6 +815,7 @@ where
     LaneCount<N>: SupportedLaneCount,
     T: SimdElement,
 {
+    #[inline]
     fn from(vector: Simd<T, N>) -> Self {
         vector.to_array()
     }
@@ -883,6 +828,7 @@ where
 {
     type Error = core::array::TryFromSliceError;
 
+    #[inline]
     fn try_from(slice: &[T]) -> Result<Self, core::array::TryFromSliceError> {
         Ok(Self::from_array(slice.try_into()?))
     }
@@ -895,6 +841,7 @@ where
 {
     type Error = core::array::TryFromSliceError;
 
+    #[inline]
     fn try_from(slice: &mut [T]) -> Result<Self, core::array::TryFromSliceError> {
         Ok(Self::from_array(slice.try_into()?))
     }
diff --git a/crates/core_simd/tests/cast.rs b/crates/core_simd/tests/cast.rs
index ab5650f071323..00545936ea2a6 100644
--- a/crates/core_simd/tests/cast.rs
+++ b/crates/core_simd/tests/cast.rs
@@ -2,7 +2,8 @@
 macro_rules! cast_types {
     ($start:ident, $($target:ident),*) => {
         mod $start {
-            use core_simd::simd::Simd;
+            #[allow(unused)]
+            use core_simd::simd::{Simd, SimdInt, SimdUint, SimdFloat};
             type Vector<const N: usize> = Simd<$start, N>;
             $(
                 mod $target {
diff --git a/crates/core_simd/tests/round.rs b/crates/core_simd/tests/round.rs
index 8b9638ad46671..aacf7bd3bcc2c 100644
--- a/crates/core_simd/tests/round.rs
+++ b/crates/core_simd/tests/round.rs
@@ -53,6 +53,7 @@ macro_rules! float_rounding_test {
 
             test_helpers::test_lanes! {
                 fn to_int_unchecked<const LANES: usize>() {
+                    use core_simd::simd::SimdFloat;
                     // The maximum integer that can be represented by the equivalently sized float has
                     // all of the mantissa digits set to 1, pushed up to the MSB.
                     const ALL_MANTISSA_BITS: IntScalar = ((1 << <Scalar>::MANTISSA_DIGITS) - 1);

From 78c05f92b8b72fdf9a455a2ce9586494cba1767f Mon Sep 17 00:00:00 2001
From: cui fliter <imcusg@gmail.com>
Date: Sun, 16 Jul 2023 00:37:30 +0800
Subject: [PATCH 02/74] remove repetitive words

Signed-off-by: cui fliter <imcusg@gmail.com>
---
 crates/core_simd/examples/dot_product.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index 391f08f55a07a..a7973ec740411 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -130,7 +130,7 @@ pub fn dot_prod_simd_4(a: &[f32], b: &[f32]) -> f32 {
 }
 
 // This version allocates a single `XMM` register for accumulation, and the folds don't allocate on top of that.
-// Notice the the use of `mul_add`, which can do a multiply and an add operation ber iteration.
+// Notice the use of `mul_add`, which can do a multiply and an add operation per iteration.
 pub fn dot_prod_simd_5(a: &[f32], b: &[f32]) -> f32 {
     a.array_chunks::<4>()
         .map(|&a| f32x4::from_array(a))

From 589572941bfbe2530450c729c0ffab21445db397 Mon Sep 17 00:00:00 2001
From: bohan <bohan-zhang@foxmail.com>
Date: Sun, 15 Oct 2023 19:38:22 +0800
Subject: [PATCH 03/74] use visibility to check unused imports and delete some
 stmts

---
 crates/core_simd/src/mod.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/crates/core_simd/src/mod.rs b/crates/core_simd/src/mod.rs
index f9891a3b7c1d7..19426769858b0 100644
--- a/crates/core_simd/src/mod.rs
+++ b/crates/core_simd/src/mod.rs
@@ -35,6 +35,5 @@ pub mod simd {
     pub use crate::core_simd::masks::*;
     pub use crate::core_simd::ord::*;
     pub use crate::core_simd::swizzle::*;
-    pub use crate::core_simd::swizzle_dyn::*;
     pub use crate::core_simd::vector::*;
 }

From 62030c77aeb031b0ca203c5bfe664d633bbe1283 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 26 Nov 2023 08:50:06 -0500
Subject: [PATCH 04/74] Merge commit 'e0e9a4517f9fc021283514da387e70a56061bd3e'
 into sync-portable-simd-2023-11-19

---
 .github/workflows/ci.yml                      |  71 +--
 .gitignore                                    |   1 -
 Cargo.lock                                    | 304 +++++++++++++
 crates/core_simd/Cargo.toml                   |   1 -
 crates/core_simd/examples/dot_product.rs      |   2 +-
 crates/core_simd/examples/matrix_inversion.rs |  49 ++-
 crates/core_simd/examples/nbody.rs            |   3 +-
 crates/core_simd/examples/spectral_norm.rs    |   2 +-
 crates/core_simd/src/core_simd_docs.md        |  35 ++
 crates/core_simd/src/fmt.rs                   |   4 +-
 crates/core_simd/src/intrinsics.rs            |   6 +
 crates/core_simd/src/iter.rs                  |  16 +-
 crates/core_simd/src/lane_count.rs            |   8 +-
 crates/core_simd/src/lib.rs                   |   4 +-
 crates/core_simd/src/masks.rs                 | 379 ++++++++++------
 crates/core_simd/src/masks/bitmask.rs         | 118 +++--
 crates/core_simd/src/masks/full_masks.rs      | 185 ++++----
 crates/core_simd/src/masks/to_bitmask.rs      |  97 -----
 crates/core_simd/src/mod.rs                   |  20 +-
 crates/core_simd/src/ops.rs                   |  11 +-
 crates/core_simd/src/ops/assign.rs            |  26 +-
 crates/core_simd/src/ops/deref.rs             |  46 +-
 crates/core_simd/src/ops/shift_scalar.rs      |  62 +++
 crates/core_simd/src/ops/unary.rs             |  46 +-
 crates/core_simd/src/select.rs                |  22 +-
 crates/core_simd/src/simd/cmp.rs              |   7 +
 crates/core_simd/src/{ => simd/cmp}/eq.rs     |  30 +-
 crates/core_simd/src/{ => simd/cmp}/ord.rs    |  67 +--
 .../src/{elements.rs => simd/num.rs}          |   6 +-
 .../src/{elements => simd/num}/float.rs       |  97 +++--
 .../src/{elements => simd/num}/int.rs         | 114 +++--
 .../src/{elements => simd/num}/uint.rs        |  97 ++++-
 crates/core_simd/src/simd/prelude.rs          |   6 +-
 crates/core_simd/src/simd/ptr.rs              |  11 +
 .../src/{elements => simd/ptr}/const_ptr.rs   |  47 +-
 .../src/{elements => simd/ptr}/mut_ptr.rs     |  45 +-
 crates/core_simd/src/swizzle.rs               | 408 +++++++++---------
 crates/core_simd/src/swizzle_dyn.rs           |   4 +-
 crates/core_simd/src/to_bytes.rs              | 132 +++++-
 crates/core_simd/src/vector.rs                |  43 +-
 crates/core_simd/src/vendor.rs                |   2 +-
 crates/core_simd/src/vendor/x86.rs            |   2 +-
 crates/core_simd/tests/cast.rs                |   2 +-
 crates/core_simd/tests/masks.rs               |  14 +-
 crates/core_simd/tests/ops_macros.rs          | 135 +++++-
 crates/core_simd/tests/pointers.rs            |   5 +-
 crates/core_simd/tests/round.rs               |   4 +-
 crates/core_simd/tests/swizzle.rs             |  28 +-
 crates/core_simd/tests/swizzle_dyn.rs         |   1 -
 crates/core_simd/tests/to_bytes.rs            |  22 +-
 crates/std_float/src/lib.rs                   |   9 +-
 crates/test_helpers/Cargo.toml                |   6 +-
 crates/test_helpers/src/biteq.rs              |  32 +-
 crates/test_helpers/src/lib.rs                | 128 +++++-
 crates/test_helpers/src/subnormals.rs         |  91 ++++
 55 files changed, 2107 insertions(+), 1006 deletions(-)
 create mode 100644 Cargo.lock
 delete mode 100644 crates/core_simd/src/masks/to_bitmask.rs
 create mode 100644 crates/core_simd/src/ops/shift_scalar.rs
 create mode 100644 crates/core_simd/src/simd/cmp.rs
 rename crates/core_simd/src/{ => simd/cmp}/eq.rs (74%)
 rename crates/core_simd/src/{ => simd/cmp}/ord.rs (79%)
 rename crates/core_simd/src/{elements.rs => simd/num.rs} (63%)
 rename crates/core_simd/src/{elements => simd/num}/float.rs (80%)
 rename crates/core_simd/src/{elements => simd/num}/int.rs (71%)
 rename crates/core_simd/src/{elements => simd/num}/uint.rs (58%)
 create mode 100644 crates/core_simd/src/simd/ptr.rs
 rename crates/core_simd/src/{elements => simd/ptr}/const_ptr.rs (81%)
 rename crates/core_simd/src/{elements => simd/ptr}/mut_ptr.rs (81%)
 create mode 100644 crates/test_helpers/src/subnormals.rs

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1ff377fce3467..90543044ea845 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -167,40 +167,33 @@ jobs:
             RUSTFLAGS: ${{ matrix.rustflags }}
 
   cross-tests:
-    name: "${{ matrix.target }} (via cross)"
+    name: "${{ matrix.target_feature }} on ${{ matrix.target }} (via cross)"
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
-      # TODO: Sadly, we cant configure target-feature in a meaningful way
-      # because `cross` doesn't tell qemu to enable any non-default cpu
-      # features, nor does it give us a way to do so.
-      #
-      # Ultimately, we'd like to do something like [rust-lang/stdarch][stdarch].
-      # This is a lot more complex... but in practice it's likely that we can just
-      # snarf the docker config from around [here][1000-dockerfiles].
-      #
-      # [stdarch]: https://github.com/rust-lang/stdarch/blob/a5db4eaf/.github/workflows/main.yml#L67
-      # [1000-dockerfiles]: https://github.com/rust-lang/stdarch/tree/a5db4eaf/ci/docker
 
       matrix:
         target:
-          - i586-unknown-linux-gnu
-          # 32-bit arm has a few idiosyncracies like having subnormal flushing
-          # to zero on by default. Ideally we'd set
           - armv7-unknown-linux-gnueabihf
-          - aarch64-unknown-linux-gnu
-          # Note: The issue above means neither of these mips targets will use
-          # MSA (mips simd) but MIPS uses a nonstandard binary representation
-          # for NaNs which makes it worth testing on despite that.
+          - thumbv7neon-unknown-linux-gnueabihf # includes neon by default
+          - aarch64-unknown-linux-gnu           # includes neon by default
+          - powerpc-unknown-linux-gnu
+          - powerpc64le-unknown-linux-gnu       # includes altivec by default
+          - riscv64gc-unknown-linux-gnu
+          # MIPS uses a nonstandard binary representation for NaNs which makes it worth testing
+          # non-nightly since https://github.com/rust-lang/rust/pull/113274
           # - mips-unknown-linux-gnu
           # - mips64-unknown-linux-gnuabi64
-          - riscv64gc-unknown-linux-gnu
-          # TODO this test works, but it appears to time out
-          # - powerpc-unknown-linux-gnu
-          # TODO this test is broken, but it appears to be a problem with QEMU, not us.
-          # - powerpc64le-unknown-linux-gnu
-          # TODO enable this once a new version of cross is released
+          # Lots of errors in QEMU and no real hardware to test on. Not clear if it's QEMU or bad codegen.
           # - powerpc64-unknown-linux-gnu
+        target_feature: [default]
+        include:
+          - { target: powerpc64le-unknown-linux-gnu, target_feature: "+vsx" }
+          # Fails due to QEMU floating point errors, probably handling subnormals incorrectly.
+          # This target is somewhat redundant, since ppc64le has altivec as well.
+          # - { target: powerpc-unknown-linux-gnu, target_feature: "+altivec" }
+          # We should test this, but cross currently can't run it
+          # - { target: riscv64gc-unknown-linux-gnu, target_feature: "+v,+zvl128b" }
 
     steps:
       - uses: actions/checkout@v2
@@ -217,11 +210,27 @@ jobs:
         # being part of the tarball means we can't just use the download/latest
         # URL :(
         run: |
-          CROSS_URL=https://github.com/rust-embedded/cross/releases/download/v0.2.1/cross-v0.2.1-x86_64-unknown-linux-gnu.tar.gz
+          CROSS_URL=https://github.com/cross-rs/cross/releases/download/v0.2.5/cross-x86_64-unknown-linux-gnu.tar.gz
           mkdir -p "$HOME/.bin"
           curl -sfSL --retry-delay 10 --retry 5 "${CROSS_URL}" | tar zxf - -C "$HOME/.bin"
           echo "$HOME/.bin" >> $GITHUB_PATH
 
+      - name: Configure Emulated CPUs
+        run: |
+          echo "CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_RUNNER=qemu-ppc -cpu e600" >> $GITHUB_ENV
+          # echo "CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_RUNNER=qemu-riscv64 -cpu rv64,zba=true,zbb=true,v=true,vlen=256,vext_spec=v1.0" >> $GITHUB_ENV
+
+      - name: Configure RUSTFLAGS
+        shell: bash
+        run: |
+          case "${{ matrix.target_feature }}" in
+            default)
+              echo "RUSTFLAGS=" >> $GITHUB_ENV;;
+            *)
+              echo "RUSTFLAGS=-Ctarget-feature=${{ matrix.target_feature }}" >> $GITHUB_ENV
+              ;;
+          esac
+
       - name: Test (debug)
         run: cross test --verbose --target=${{ matrix.target }}
 
@@ -229,7 +238,7 @@ jobs:
         run: cross test --verbose --target=${{ matrix.target }} --release
 
   features:
-    name: "Check cargo features (${{ matrix.simd }} × ${{ matrix.features }})"
+    name: "Test cargo features (${{ matrix.simd }} × ${{ matrix.features }})"
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
@@ -240,12 +249,8 @@ jobs:
         features:
           - ""
           - "--features std"
-          - "--features generic_const_exprs"
-          - "--features std --features generic_const_exprs"
           - "--features all_lane_counts"
-          - "--features all_lane_counts --features std"
-          - "--features all_lane_counts --features generic_const_exprs"
-          - "--features all_lane_counts --features std --features generic_const_exprs"
+          - "--all-features"
 
     steps:
       - uses: actions/checkout@v2
@@ -257,9 +262,9 @@ jobs:
         run: echo "CPU_FEATURE=$(lscpu | grep -o avx512[a-z]* | sed s/avx/+avx/ | tr '\n' ',' )" >> $GITHUB_ENV
       - name: Check build
         if: ${{ matrix.simd == '' }}
-        run: RUSTFLAGS="-Dwarnings" cargo check --all-targets --no-default-features ${{ matrix.features }}
+        run: RUSTFLAGS="-Dwarnings" cargo test --all-targets --no-default-features ${{ matrix.features }}
       - name: Check AVX
         if: ${{ matrix.simd == 'avx512' && contains(env.CPU_FEATURE, 'avx512') }}
         run: |
           echo "Found AVX features: $CPU_FEATURE"
-          RUSTFLAGS="-Dwarnings -Ctarget-feature=$CPU_FEATURE" cargo check --all-targets --no-default-features ${{ matrix.features }}
+          RUSTFLAGS="-Dwarnings -Ctarget-feature=$CPU_FEATURE" cargo test --all-targets --no-default-features ${{ matrix.features }}
diff --git a/.gitignore b/.gitignore
index 96ef6c0b944e2..ea8c4bf7f35f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1 @@
 /target
-Cargo.lock
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000000000..46312c09657d5
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,304 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "autocfg"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
+
+[[package]]
+name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
+[[package]]
+name = "bumpalo"
+version = "3.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1"
+
+[[package]]
+name = "byteorder"
+version = "1.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "console_error_panic_hook"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc"
+dependencies = [
+ "cfg-if",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "core_simd"
+version = "0.1.0"
+dependencies = [
+ "proptest",
+ "std_float",
+ "test_helpers",
+ "wasm-bindgen",
+ "wasm-bindgen-test",
+]
+
+[[package]]
+name = "js-sys"
+version = "0.3.64"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a"
+dependencies = [
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "log"
+version = "0.4.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
+
+[[package]]
+name = "num-traits"
+version = "0.2.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
+
+[[package]]
+name = "ppv-lite86"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.66"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "18fb31db3f9bddb2ea821cde30a9f70117e3f119938b5ee630b7403aa6e2ead9"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "proptest"
+version = "0.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "12e6c80c1139113c28ee4670dc50cc42915228b51f56a9e407f0ec60f966646f"
+dependencies = [
+ "bitflags",
+ "byteorder",
+ "num-traits",
+ "rand",
+ "rand_chacha",
+ "rand_xorshift",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "rand"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
+dependencies = [
+ "rand_chacha",
+ "rand_core",
+ "rand_hc",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
+dependencies = [
+ "ppv-lite86",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
+
+[[package]]
+name = "rand_hc"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
+dependencies = [
+ "rand_core",
+]
+
+[[package]]
+name = "rand_xorshift"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77d416b86801d23dde1aa643023b775c3a462efc0ed96443add11546cdf1dca8"
+dependencies = [
+ "rand_core",
+]
+
+[[package]]
+name = "scoped-tls"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294"
+
+[[package]]
+name = "std_float"
+version = "0.1.0"
+dependencies = [
+ "core_simd",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c324c494eba9d92503e6f1ef2e6df781e78f6a7705a0202d9801b198807d518a"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "test_helpers"
+version = "0.1.0"
+dependencies = [
+ "proptest",
+]
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c"
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342"
+dependencies = [
+ "cfg-if",
+ "wasm-bindgen-macro",
+]
+
+[[package]]
+name = "wasm-bindgen-backend"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd"
+dependencies = [
+ "bumpalo",
+ "log",
+ "once_cell",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-futures"
+version = "0.4.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c02dbc21516f9f1f04f187958890d7e6026df8d16540b7ad9492bc34a67cea03"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "wasm-bindgen",
+ "web-sys",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-backend",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.87"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1"
+
+[[package]]
+name = "wasm-bindgen-test"
+version = "0.3.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6e6e302a7ea94f83a6d09e78e7dc7d9ca7b186bc2829c24a22d0753efd680671"
+dependencies = [
+ "console_error_panic_hook",
+ "js-sys",
+ "scoped-tls",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "wasm-bindgen-test-macro",
+]
+
+[[package]]
+name = "wasm-bindgen-test-macro"
+version = "0.3.37"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ecb993dd8c836930ed130e020e77d9b2e65dd0fbab1b67c790b0f5d80b11a575"
+dependencies = [
+ "proc-macro2",
+ "quote",
+]
+
+[[package]]
+name = "web-sys"
+version = "0.3.64"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
diff --git a/crates/core_simd/Cargo.toml b/crates/core_simd/Cargo.toml
index d1a3a515a7e81..b4a8fd70f4c0e 100644
--- a/crates/core_simd/Cargo.toml
+++ b/crates/core_simd/Cargo.toml
@@ -12,7 +12,6 @@ license = "MIT OR Apache-2.0"
 default = ["as_crate"]
 as_crate = []
 std = []
-generic_const_exprs = []
 all_lane_counts = []
 
 [target.'cfg(target_arch = "wasm32")'.dev-dependencies]
diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index a7973ec740411..f047010a65c16 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -6,7 +6,7 @@
 #![feature(slice_as_chunks)]
 // Add these imports to use the stdsimd library
 #![feature(portable_simd)]
-use core_simd::simd::*;
+use core_simd::simd::prelude::*;
 
 // This is your barebones dot product implementation:
 // Take 2 vectors, multiply them element wise and *then*
diff --git a/crates/core_simd/examples/matrix_inversion.rs b/crates/core_simd/examples/matrix_inversion.rs
index 39f530f68f57a..bad86414401d7 100644
--- a/crates/core_simd/examples/matrix_inversion.rs
+++ b/crates/core_simd/examples/matrix_inversion.rs
@@ -2,8 +2,7 @@
 // Code ported from the `packed_simd` crate
 // Run this code with `cargo test --example matrix_inversion`
 #![feature(array_chunks, portable_simd)]
-use core_simd::simd::*;
-use Which::*;
+use core_simd::simd::prelude::*;
 
 // Gotta define our own 4x4 matrix since Rust doesn't ship multidim arrays yet :^)
 #[derive(Copy, Clone, Debug, PartialEq, PartialOrd)]
@@ -164,10 +163,10 @@ pub fn simd_inv4x4(m: Matrix4x4) -> Option<Matrix4x4> {
     let m_2 = f32x4::from_array(m[2]);
     let m_3 = f32x4::from_array(m[3]);
 
-    const SHUFFLE01: [Which; 4] = [First(0), First(1), Second(0), Second(1)];
-    const SHUFFLE02: [Which; 4] = [First(0), First(2), Second(0), Second(2)];
-    const SHUFFLE13: [Which; 4] = [First(1), First(3), Second(1), Second(3)];
-    const SHUFFLE23: [Which; 4] = [First(2), First(3), Second(2), Second(3)];
+    const SHUFFLE01: [usize; 4] = [0, 1, 4, 5];
+    const SHUFFLE02: [usize; 4] = [0, 2, 4, 6];
+    const SHUFFLE13: [usize; 4] = [1, 3, 5, 7];
+    const SHUFFLE23: [usize; 4] = [2, 3, 6, 7];
 
     let tmp = simd_swizzle!(m_0, m_1, SHUFFLE01);
     let row1 = simd_swizzle!(m_2, m_3, SHUFFLE01);
@@ -180,58 +179,58 @@ pub fn simd_inv4x4(m: Matrix4x4) -> Option<Matrix4x4> {
     let row2 = simd_swizzle!(tmp, row3, SHUFFLE02);
     let row3 = simd_swizzle!(row3, tmp, SHUFFLE13);
 
-    let tmp = (row2 * row3).reverse().rotate_lanes_right::<2>();
+    let tmp = (row2 * row3).reverse().rotate_elements_right::<2>();
     let minor0 = row1 * tmp;
     let minor1 = row0 * tmp;
-    let tmp = tmp.rotate_lanes_right::<2>();
+    let tmp = tmp.rotate_elements_right::<2>();
     let minor0 = (row1 * tmp) - minor0;
     let minor1 = (row0 * tmp) - minor1;
-    let minor1 = minor1.rotate_lanes_right::<2>();
+    let minor1 = minor1.rotate_elements_right::<2>();
 
-    let tmp = (row1 * row2).reverse().rotate_lanes_right::<2>();
+    let tmp = (row1 * row2).reverse().rotate_elements_right::<2>();
     let minor0 = (row3 * tmp) + minor0;
     let minor3 = row0 * tmp;
-    let tmp = tmp.rotate_lanes_right::<2>();
+    let tmp = tmp.rotate_elements_right::<2>();
 
     let minor0 = minor0 - row3 * tmp;
     let minor3 = row0 * tmp - minor3;
-    let minor3 = minor3.rotate_lanes_right::<2>();
+    let minor3 = minor3.rotate_elements_right::<2>();
 
-    let tmp = (row3 * row1.rotate_lanes_right::<2>())
+    let tmp = (row3 * row1.rotate_elements_right::<2>())
         .reverse()
-        .rotate_lanes_right::<2>();
-    let row2 = row2.rotate_lanes_right::<2>();
+        .rotate_elements_right::<2>();
+    let row2 = row2.rotate_elements_right::<2>();
     let minor0 = row2 * tmp + minor0;
     let minor2 = row0 * tmp;
-    let tmp = tmp.rotate_lanes_right::<2>();
+    let tmp = tmp.rotate_elements_right::<2>();
     let minor0 = minor0 - row2 * tmp;
     let minor2 = row0 * tmp - minor2;
-    let minor2 = minor2.rotate_lanes_right::<2>();
+    let minor2 = minor2.rotate_elements_right::<2>();
 
-    let tmp = (row0 * row1).reverse().rotate_lanes_right::<2>();
+    let tmp = (row0 * row1).reverse().rotate_elements_right::<2>();
     let minor2 = minor2 + row3 * tmp;
     let minor3 = row2 * tmp - minor3;
-    let tmp = tmp.rotate_lanes_right::<2>();
+    let tmp = tmp.rotate_elements_right::<2>();
     let minor2 = row3 * tmp - minor2;
     let minor3 = minor3 - row2 * tmp;
 
-    let tmp = (row0 * row3).reverse().rotate_lanes_right::<2>();
+    let tmp = (row0 * row3).reverse().rotate_elements_right::<2>();
     let minor1 = minor1 - row2 * tmp;
     let minor2 = row1 * tmp + minor2;
-    let tmp = tmp.rotate_lanes_right::<2>();
+    let tmp = tmp.rotate_elements_right::<2>();
     let minor1 = row2 * tmp + minor1;
     let minor2 = minor2 - row1 * tmp;
 
-    let tmp = (row0 * row2).reverse().rotate_lanes_right::<2>();
+    let tmp = (row0 * row2).reverse().rotate_elements_right::<2>();
     let minor1 = row3 * tmp + minor1;
     let minor3 = minor3 - row1 * tmp;
-    let tmp = tmp.rotate_lanes_right::<2>();
+    let tmp = tmp.rotate_elements_right::<2>();
     let minor1 = minor1 - row3 * tmp;
     let minor3 = row1 * tmp + minor3;
 
     let det = row0 * minor0;
-    let det = det.rotate_lanes_right::<2>() + det;
-    let det = det.reverse().rotate_lanes_right::<2>() + det;
+    let det = det.rotate_elements_right::<2>() + det;
+    let det = det.reverse().rotate_elements_right::<2>() + det;
 
     if det.reduce_sum() == 0. {
         return None;
diff --git a/crates/core_simd/examples/nbody.rs b/crates/core_simd/examples/nbody.rs
index df38a00967feb..65820d1340bd7 100644
--- a/crates/core_simd/examples/nbody.rs
+++ b/crates/core_simd/examples/nbody.rs
@@ -1,11 +1,12 @@
 #![feature(portable_simd)]
+#![allow(clippy::excessive_precision)]
 extern crate std_float;
 
 /// Benchmarks game nbody code
 /// Taken from the `packed_simd` crate
 /// Run this benchmark with `cargo test --example nbody`
 mod nbody {
-    use core_simd::simd::*;
+    use core_simd::simd::prelude::*;
     #[allow(unused)] // False positive?
     use std_float::StdFloat;
 
diff --git a/crates/core_simd/examples/spectral_norm.rs b/crates/core_simd/examples/spectral_norm.rs
index d576bd0ccee03..bc7934c252235 100644
--- a/crates/core_simd/examples/spectral_norm.rs
+++ b/crates/core_simd/examples/spectral_norm.rs
@@ -1,6 +1,6 @@
 #![feature(portable_simd)]
 
-use core_simd::simd::*;
+use core_simd::simd::prelude::*;
 
 fn a(i: usize, j: usize) -> f64 {
     ((i + j) * (i + j + 1) / 2 + i + 1) as f64
diff --git a/crates/core_simd/src/core_simd_docs.md b/crates/core_simd/src/core_simd_docs.md
index 15e8ed0253e14..fa93155ff5ed1 100644
--- a/crates/core_simd/src/core_simd_docs.md
+++ b/crates/core_simd/src/core_simd_docs.md
@@ -2,3 +2,38 @@ Portable SIMD module.
 
 This module offers a portable abstraction for SIMD operations
 that is not bound to any particular hardware architecture.
+
+# What is "portable"?
+
+This module provides a SIMD implementation that is fast and predictable on any target.
+
+### Portable SIMD works on every target
+
+Unlike target-specific SIMD in `std::arch`, portable SIMD compiles for every target.
+In this regard, it is just like "regular" Rust.
+
+### Portable SIMD is consistent between targets
+
+A program using portable SIMD can expect identical behavior on any target.
+In most regards, [`Simd<T, N>`] can be thought of as a parallelized `[T; N]` and operates like a sequence of `T`.
+
+This has one notable exception: a handful of older architectures (e.g. `armv7` and `powerpc`) flush [subnormal](`f32::is_subnormal`) `f32` values to zero.
+On these architectures, subnormal `f32` input values are replaced with zeros, and any operation producing subnormal `f32` values produces zeros instead.
+This doesn't affect most architectures or programs.
+
+### Operations use the best instructions available
+
+Operations provided by this module compile to the best available SIMD instructions.
+
+Portable SIMD is not a low-level vendor library, and operations in portable SIMD _do not_ necessarily map to a single instruction.
+Instead, they map to a reasonable implementation of the operation for the target.
+
+Consistency between targets is not compromised to use faster or fewer instructions.
+In some cases, `std::arch` will provide a faster function that has slightly different behavior than the `std::simd` equivalent.
+For example, [`_mm_min_ps`](`core::arch::x86_64::_mm_min_ps`)[^1] can be slightly faster than [`SimdFloat::simd_min`](`num::SimdFloat::simd_min`), but does not conform to the IEEE standard also used by [`f32::min`].
+When necessary, [`Simd<T, N>`] can be converted to the types provided by `std::arch` to make use of target-specific functions.
+
+Many targets simply don't have SIMD, or don't support SIMD for a particular element type.
+In those cases, regular scalar operations are generated instead.
+
+[^1]: `_mm_min_ps(x, y)` is equivalent to `x.simd_lt(y).select(x, y)`
diff --git a/crates/core_simd/src/fmt.rs b/crates/core_simd/src/fmt.rs
index b7317969cbb49..3a540f5a04908 100644
--- a/crates/core_simd/src/fmt.rs
+++ b/crates/core_simd/src/fmt.rs
@@ -1,9 +1,9 @@
 use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
 use core::fmt;
 
-impl<T, const LANES: usize> fmt::Debug for Simd<T, LANES>
+impl<T, const N: usize> fmt::Debug for Simd<T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     T: SimdElement + fmt::Debug,
 {
     /// A `Simd<T, N>` has a debug format like the one for `[T]`:
diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs
index dd6698e2ba56c..b27893bc7294d 100644
--- a/crates/core_simd/src/intrinsics.rs
+++ b/crates/core_simd/src/intrinsics.rs
@@ -160,4 +160,10 @@ extern "platform-intrinsic" {
 
     /// convert an exposed address back to a pointer
     pub(crate) fn simd_from_exposed_addr<T, U>(addr: T) -> U;
+
+    // Integer operations
+    pub(crate) fn simd_bswap<T>(x: T) -> T;
+    pub(crate) fn simd_bitreverse<T>(x: T) -> T;
+    pub(crate) fn simd_ctlz<T>(x: T) -> T;
+    pub(crate) fn simd_cttz<T>(x: T) -> T;
 }
diff --git a/crates/core_simd/src/iter.rs b/crates/core_simd/src/iter.rs
index 328c995b81ddd..b3732fd74d5f6 100644
--- a/crates/core_simd/src/iter.rs
+++ b/crates/core_simd/src/iter.rs
@@ -6,9 +6,9 @@ use core::{
 
 macro_rules! impl_traits {
     { $type:ty } => {
-        impl<const LANES: usize> Sum<Self> for Simd<$type, LANES>
+        impl<const N: usize> Sum<Self> for Simd<$type, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
             #[inline]
             fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
@@ -16,9 +16,9 @@ macro_rules! impl_traits {
             }
         }
 
-        impl<const LANES: usize> Product<Self> for Simd<$type, LANES>
+        impl<const N: usize> Product<Self> for Simd<$type, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
             #[inline]
             fn product<I: Iterator<Item = Self>>(iter: I) -> Self {
@@ -26,9 +26,9 @@ macro_rules! impl_traits {
             }
         }
 
-        impl<'a, const LANES: usize> Sum<&'a Self> for Simd<$type, LANES>
+        impl<'a, const N: usize> Sum<&'a Self> for Simd<$type, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
             #[inline]
             fn sum<I: Iterator<Item = &'a Self>>(iter: I) -> Self {
@@ -36,9 +36,9 @@ macro_rules! impl_traits {
             }
         }
 
-        impl<'a, const LANES: usize> Product<&'a Self> for Simd<$type, LANES>
+        impl<'a, const N: usize> Product<&'a Self> for Simd<$type, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
             #[inline]
             fn product<I: Iterator<Item = &'a Self>>(iter: I) -> Self {
diff --git a/crates/core_simd/src/lane_count.rs b/crates/core_simd/src/lane_count.rs
index 2b91eb9e80047..4cd7265ed671e 100644
--- a/crates/core_simd/src/lane_count.rs
+++ b/crates/core_simd/src/lane_count.rs
@@ -4,11 +4,11 @@ mod sealed {
 use sealed::Sealed;
 
 /// Specifies the number of lanes in a SIMD vector as a type.
-pub struct LaneCount<const LANES: usize>;
+pub struct LaneCount<const N: usize>;
 
-impl<const LANES: usize> LaneCount<LANES> {
+impl<const N: usize> LaneCount<N> {
     /// The number of bytes in a bitmask with this many lanes.
-    pub const BITMASK_LEN: usize = (LANES + 7) / 8;
+    pub const BITMASK_LEN: usize = (N + 7) / 8;
 }
 
 /// Statically guarantees that a lane count is marked as supported.
@@ -21,7 +21,7 @@ pub trait SupportedLaneCount: Sealed {
     type BitMask: Copy + Default + AsRef<[u8]> + AsMut<[u8]>;
 }
 
-impl<const LANES: usize> Sealed for LaneCount<LANES> {}
+impl<const N: usize> Sealed for LaneCount<N> {}
 
 macro_rules! supported_lane_count {
     ($($lanes:literal),+) => {
diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index fde406bda7060..64ba9705ef523 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -5,6 +5,7 @@
     const_mut_refs,
     convert_float_to_int,
     decl_macro,
+    inline_const,
     intra_doc_pointers,
     platform_intrinsics,
     repr_simd,
@@ -14,10 +15,9 @@
     strict_provenance,
     ptr_metadata
 )]
-#![cfg_attr(feature = "generic_const_exprs", feature(generic_const_exprs))]
-#![cfg_attr(feature = "generic_const_exprs", allow(incomplete_features))]
 #![warn(missing_docs, clippy::missing_inline_in_public_items)] // basically all items, really
 #![deny(unsafe_op_in_unsafe_fn, clippy::undocumented_unsafe_blocks)]
+#![allow(internal_features)]
 #![unstable(feature = "portable_simd", issue = "86656")]
 //! Portable SIMD module.
 
diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index fea687bdc1aef..0623d2bf3d121 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -1,4 +1,4 @@
-//! Types and traits associated with masking lanes of vectors.
+//! Types and traits associated with masking elements of vectors.
 //! Types representing
 #![allow(non_camel_case_types)]
 
@@ -12,13 +12,9 @@
 )]
 mod mask_impl;
 
-mod to_bitmask;
-pub use to_bitmask::ToBitMask;
-
-#[cfg(feature = "generic_const_exprs")]
-pub use to_bitmask::{bitmask_len, ToBitMaskArray};
-
-use crate::simd::{intrinsics, LaneCount, Simd, SimdElement, SimdPartialEq, SupportedLaneCount};
+use crate::simd::{
+    cmp::SimdPartialEq, intrinsics, LaneCount, Simd, SimdCast, SimdElement, SupportedLaneCount,
+};
 use core::cmp::Ordering;
 use core::{fmt, mem};
 
@@ -32,13 +28,17 @@ mod sealed {
     /// prevent us from ever removing that bound, or from implementing `MaskElement` on
     /// non-`PartialEq` types in the future.
     pub trait Sealed {
-        fn valid<const LANES: usize>(values: Simd<Self, LANES>) -> bool
+        fn valid<const N: usize>(values: Simd<Self, N>) -> bool
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
             Self: SimdElement;
 
         fn eq(self, other: Self) -> bool;
 
+        fn as_usize(self) -> usize;
+
+        type Unsigned: SimdElement;
+
         const TRUE: Self;
 
         const FALSE: Self;
@@ -50,15 +50,15 @@ use sealed::Sealed;
 ///
 /// # Safety
 /// Type must be a signed integer.
-pub unsafe trait MaskElement: SimdElement + Sealed {}
+pub unsafe trait MaskElement: SimdElement<Mask = Self> + SimdCast + Sealed {}
 
 macro_rules! impl_element {
-    { $ty:ty } => {
+    { $ty:ty, $unsigned:ty } => {
         impl Sealed for $ty {
             #[inline]
-            fn valid<const LANES: usize>(value: Simd<Self, LANES>) -> bool
+            fn valid<const N: usize>(value: Simd<Self, N>) -> bool
             where
-                LaneCount<LANES>: SupportedLaneCount,
+                LaneCount<N>: SupportedLaneCount,
             {
                 (value.simd_eq(Simd::splat(0 as _)) | value.simd_eq(Simd::splat(-1 as _))).all()
             }
@@ -66,6 +66,13 @@ macro_rules! impl_element {
             #[inline]
             fn eq(self, other: Self) -> bool { self == other }
 
+            #[inline]
+            fn as_usize(self) -> usize {
+                self as usize
+            }
+
+            type Unsigned = $unsigned;
+
             const TRUE: Self = -1;
             const FALSE: Self = 0;
         }
@@ -75,36 +82,36 @@ macro_rules! impl_element {
     }
 }
 
-impl_element! { i8 }
-impl_element! { i16 }
-impl_element! { i32 }
-impl_element! { i64 }
-impl_element! { isize }
+impl_element! { i8, u8 }
+impl_element! { i16, u16 }
+impl_element! { i32, u32 }
+impl_element! { i64, u64 }
+impl_element! { isize, usize }
 
-/// A SIMD vector mask for `LANES` elements of width specified by `Element`.
+/// A SIMD vector mask for `N` elements of width specified by `Element`.
 ///
-/// Masks represent boolean inclusion/exclusion on a per-lane basis.
+/// Masks represent boolean inclusion/exclusion on a per-element basis.
 ///
 /// The layout of this type is unspecified, and may change between platforms
 /// and/or Rust versions, and code should not assume that it is equivalent to
-/// `[T; LANES]`.
-#[cfg_attr(not(doc), repr(transparent))] // work around https://github.com/rust-lang/rust/issues/90435
-pub struct Mask<T, const LANES: usize>(mask_impl::Mask<T, LANES>)
+/// `[T; N]`.
+#[repr(transparent)]
+pub struct Mask<T, const N: usize>(mask_impl::Mask<T, N>)
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount;
+    LaneCount<N>: SupportedLaneCount;
 
-impl<T, const LANES: usize> Copy for Mask<T, LANES>
+impl<T, const N: usize> Copy for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
 }
 
-impl<T, const LANES: usize> Clone for Mask<T, LANES>
+impl<T, const N: usize> Clone for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn clone(&self) -> Self {
@@ -112,12 +119,12 @@ where
     }
 }
 
-impl<T, const LANES: usize> Mask<T, LANES>
+impl<T, const N: usize> Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
-    /// Construct a mask by setting all lanes to the given value.
+    /// Construct a mask by setting all elements to the given value.
     #[inline]
     pub fn splat(value: bool) -> Self {
         Self(mask_impl::Mask::splat(value))
@@ -125,7 +132,7 @@ where
 
     /// Converts an array of bools to a SIMD mask.
     #[inline]
-    pub fn from_array(array: [bool; LANES]) -> Self {
+    pub fn from_array(array: [bool; N]) -> Self {
         // SAFETY: Rust's bool has a layout of 1 byte (u8) with a value of
         //     true:    0b_0000_0001
         //     false:   0b_0000_0000
@@ -133,16 +140,15 @@ where
         // This would be hypothetically valid as an "in-place" transmute,
         // but these are "dependently-sized" types, so copy elision it is!
         unsafe {
-            let bytes: [u8; LANES] = mem::transmute_copy(&array);
-            let bools: Simd<i8, LANES> =
-                intrinsics::simd_ne(Simd::from_array(bytes), Simd::splat(0u8));
+            let bytes: [u8; N] = mem::transmute_copy(&array);
+            let bools: Simd<i8, N> = intrinsics::simd_ne(Simd::from_array(bytes), Simd::splat(0u8));
             Mask::from_int_unchecked(intrinsics::simd_cast(bools))
         }
     }
 
     /// Converts a SIMD mask to an array of bools.
     #[inline]
-    pub fn to_array(self) -> [bool; LANES] {
+    pub fn to_array(self) -> [bool; N] {
         // This follows mostly the same logic as from_array.
         // SAFETY: Rust's bool has a layout of 1 byte (u8) with a value of
         //     true:    0b_0000_0001
@@ -154,7 +160,7 @@ where
         // This would be hypothetically valid as an "in-place" transmute,
         // but these are "dependently-sized" types, so copy elision it is!
         unsafe {
-            let mut bytes: Simd<i8, LANES> = intrinsics::simd_cast(self.to_int());
+            let mut bytes: Simd<i8, N> = intrinsics::simd_cast(self.to_int());
             bytes &= Simd::splat(1i8);
             mem::transmute_copy(&bytes)
         }
@@ -164,10 +170,10 @@ where
     /// represents `true`.
     ///
     /// # Safety
-    /// All lanes must be either 0 or -1.
+    /// All elements must be either 0 or -1.
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub unsafe fn from_int_unchecked(value: Simd<T, LANES>) -> Self {
+    pub unsafe fn from_int_unchecked(value: Simd<T, N>) -> Self {
         // Safety: the caller must confirm this invariant
         unsafe { Self(mask_impl::Mask::from_int_unchecked(value)) }
     }
@@ -176,11 +182,11 @@ where
     /// represents `true`.
     ///
     /// # Panics
-    /// Panics if any lane is not 0 or -1.
+    /// Panics if any element is not 0 or -1.
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
     #[track_caller]
-    pub fn from_int(value: Simd<T, LANES>) -> Self {
+    pub fn from_int(value: Simd<T, N>) -> Self {
         assert!(T::valid(value), "all values must be either 0 or -1",);
         // Safety: the validity has been checked
         unsafe { Self::from_int_unchecked(value) }
@@ -190,121 +196,244 @@ where
     /// represents `true`.
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original value"]
-    pub fn to_int(self) -> Simd<T, LANES> {
+    pub fn to_int(self) -> Simd<T, N> {
         self.0.to_int()
     }
 
-    /// Converts the mask to a mask of any other lane size.
+    /// Converts the mask to a mask of any other element size.
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn cast<U: MaskElement>(self) -> Mask<U, LANES> {
+    pub fn cast<U: MaskElement>(self) -> Mask<U, N> {
         Mask(self.0.convert())
     }
 
-    /// Tests the value of the specified lane.
+    /// Tests the value of the specified element.
     ///
     /// # Safety
-    /// `lane` must be less than `LANES`.
+    /// `index` must be less than `N`.
     #[inline]
     #[must_use = "method returns a new bool and does not mutate the original value"]
-    pub unsafe fn test_unchecked(&self, lane: usize) -> bool {
+    pub unsafe fn test_unchecked(&self, index: usize) -> bool {
         // Safety: the caller must confirm this invariant
-        unsafe { self.0.test_unchecked(lane) }
+        unsafe { self.0.test_unchecked(index) }
     }
 
-    /// Tests the value of the specified lane.
+    /// Tests the value of the specified element.
     ///
     /// # Panics
-    /// Panics if `lane` is greater than or equal to the number of lanes in the vector.
+    /// Panics if `index` is greater than or equal to the number of elements in the vector.
     #[inline]
     #[must_use = "method returns a new bool and does not mutate the original value"]
     #[track_caller]
-    pub fn test(&self, lane: usize) -> bool {
-        assert!(lane < LANES, "lane index out of range");
-        // Safety: the lane index has been checked
-        unsafe { self.test_unchecked(lane) }
+    pub fn test(&self, index: usize) -> bool {
+        assert!(index < N, "element index out of range");
+        // Safety: the element index has been checked
+        unsafe { self.test_unchecked(index) }
     }
 
-    /// Sets the value of the specified lane.
+    /// Sets the value of the specified element.
     ///
     /// # Safety
-    /// `lane` must be less than `LANES`.
+    /// `index` must be less than `N`.
     #[inline]
-    pub unsafe fn set_unchecked(&mut self, lane: usize, value: bool) {
+    pub unsafe fn set_unchecked(&mut self, index: usize, value: bool) {
         // Safety: the caller must confirm this invariant
         unsafe {
-            self.0.set_unchecked(lane, value);
+            self.0.set_unchecked(index, value);
         }
     }
 
-    /// Sets the value of the specified lane.
+    /// Sets the value of the specified element.
     ///
     /// # Panics
-    /// Panics if `lane` is greater than or equal to the number of lanes in the vector.
+    /// Panics if `index` is greater than or equal to the number of elements in the vector.
     #[inline]
     #[track_caller]
-    pub fn set(&mut self, lane: usize, value: bool) {
-        assert!(lane < LANES, "lane index out of range");
-        // Safety: the lane index has been checked
+    pub fn set(&mut self, index: usize, value: bool) {
+        assert!(index < N, "element index out of range");
+        // Safety: the element index has been checked
         unsafe {
-            self.set_unchecked(lane, value);
+            self.set_unchecked(index, value);
         }
     }
 
-    /// Returns true if any lane is set, or false otherwise.
+    /// Returns true if any element is set, or false otherwise.
     #[inline]
     #[must_use = "method returns a new bool and does not mutate the original value"]
     pub fn any(self) -> bool {
         self.0.any()
     }
 
-    /// Returns true if all lanes are set, or false otherwise.
+    /// Returns true if all elements are set, or false otherwise.
     #[inline]
     #[must_use = "method returns a new bool and does not mutate the original value"]
     pub fn all(self) -> bool {
         self.0.all()
     }
+
+    /// Create a bitmask from a mask.
+    ///
+    /// Each bit is set if the corresponding element in the mask is `true`.
+    /// If the mask contains more than 64 elements, the bitmask is truncated to the first 64.
+    #[inline]
+    #[must_use = "method returns a new integer and does not mutate the original value"]
+    pub fn to_bitmask(self) -> u64 {
+        self.0.to_bitmask_integer()
+    }
+
+    /// Create a mask from a bitmask.
+    ///
+    /// For each bit, if it is set, the corresponding element in the mask is set to `true`.
+    /// If the mask contains more than 64 elements, the remainder are set to `false`.
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
+    pub fn from_bitmask(bitmask: u64) -> Self {
+        Self(mask_impl::Mask::from_bitmask_integer(bitmask))
+    }
+
+    /// Create a bitmask vector from a mask.
+    ///
+    /// Each bit is set if the corresponding element in the mask is `true`.
+    /// The remaining bits are unset.
+    ///
+    /// The bits are packed into the first `N` bits of the vector, least-significant bit first:
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::mask32x8;
+    /// let mask = mask32x8::from_array([true, false, true, false, false, false, true, false]);
+    /// assert_eq!(mask.to_bitmask_vector()[0], 0b01000101);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    pub fn to_bitmask_vector(self) -> Simd<u8, N> {
+        self.0.to_bitmask_vector()
+    }
+
+    /// Create a mask from a bitmask vector.
+    ///
+    /// For each bit, if it is set, the corresponding element in the mask is set to `true`.
+    ///
+    /// The bits are packed into the first N bits of the vector:
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::{mask32x8, u8x8};
+    /// let bitmask = u8x8::from_array([0b01000101, 0, 0, 0, 0, 0, 0, 0]);
+    /// assert_eq!(
+    ///     mask32x8::from_bitmask_vector(bitmask),
+    ///     mask32x8::from_array([true, false, true, false, false, false, true, false]),
+    /// );
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original value"]
+    pub fn from_bitmask_vector(bitmask: Simd<u8, N>) -> Self {
+        Self(mask_impl::Mask::from_bitmask_vector(bitmask))
+    }
+
+    /// Find the index of the first set element.
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::mask32x8;
+    /// assert_eq!(mask32x8::splat(false).first_set(), None);
+    /// assert_eq!(mask32x8::splat(true).first_set(), Some(0));
+    ///
+    /// let mask = mask32x8::from_array([false, true, false, false, true, false, false, true]);
+    /// assert_eq!(mask.first_set(), Some(1));
+    /// ```
+    #[inline]
+    #[must_use = "method returns the index and does not mutate the original value"]
+    pub fn first_set(self) -> Option<usize> {
+        // If bitmasks are efficient, using them is better
+        if cfg!(target_feature = "sse") && N <= 64 {
+            let tz = self.to_bitmask().trailing_zeros();
+            return if tz == 64 { None } else { Some(tz as usize) };
+        }
+
+        // To find the first set index:
+        // * create a vector 0..N
+        // * replace unset mask elements in that vector with -1
+        // * perform _unsigned_ reduce-min
+        // * check if the result is -1 or an index
+
+        let index = Simd::from_array(
+            const {
+                let mut index = [0; N];
+                let mut i = 0;
+                while i < N {
+                    index[i] = i;
+                    i += 1;
+                }
+                index
+            },
+        );
+
+        // Safety: the input and output are integer vectors
+        let index: Simd<T, N> = unsafe { intrinsics::simd_cast(index) };
+
+        let masked_index = self.select(index, Self::splat(true).to_int());
+
+        // Safety: the input and output are integer vectors
+        let masked_index: Simd<T::Unsigned, N> = unsafe { intrinsics::simd_cast(masked_index) };
+
+        // Safety: the input is an integer vector
+        let min_index: T::Unsigned = unsafe { intrinsics::simd_reduce_min(masked_index) };
+
+        // Safety: the return value is the unsigned version of T
+        let min_index: T = unsafe { core::mem::transmute_copy(&min_index) };
+
+        if min_index.eq(T::TRUE) {
+            None
+        } else {
+            Some(min_index.as_usize())
+        }
+    }
 }
 
 // vector/array conversion
-impl<T, const LANES: usize> From<[bool; LANES]> for Mask<T, LANES>
+impl<T, const N: usize> From<[bool; N]> for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
-    fn from(array: [bool; LANES]) -> Self {
+    fn from(array: [bool; N]) -> Self {
         Self::from_array(array)
     }
 }
 
-impl<T, const LANES: usize> From<Mask<T, LANES>> for [bool; LANES]
+impl<T, const N: usize> From<Mask<T, N>> for [bool; N]
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
-    fn from(vector: Mask<T, LANES>) -> Self {
+    fn from(vector: Mask<T, N>) -> Self {
         vector.to_array()
     }
 }
 
-impl<T, const LANES: usize> Default for Mask<T, LANES>
+impl<T, const N: usize> Default for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
-    #[must_use = "method returns a defaulted mask with all lanes set to false (0)"]
+    #[must_use = "method returns a defaulted mask with all elements set to false (0)"]
     fn default() -> Self {
         Self::splat(false)
     }
 }
 
-impl<T, const LANES: usize> PartialEq for Mask<T, LANES>
+impl<T, const N: usize> PartialEq for Mask<T, N>
 where
     T: MaskElement + PartialEq,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     #[must_use = "method returns a new bool and does not mutate the original value"]
@@ -313,10 +442,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> PartialOrd for Mask<T, LANES>
+impl<T, const N: usize> PartialOrd for Mask<T, N>
 where
     T: MaskElement + PartialOrd,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     #[must_use = "method returns a new Ordering and does not mutate the original value"]
@@ -325,23 +454,23 @@ where
     }
 }
 
-impl<T, const LANES: usize> fmt::Debug for Mask<T, LANES>
+impl<T, const N: usize> fmt::Debug for Mask<T, N>
 where
     T: MaskElement + fmt::Debug,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.debug_list()
-            .entries((0..LANES).map(|lane| self.test(lane)))
+            .entries((0..N).map(|i| self.test(i)))
             .finish()
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitAnd for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitAnd for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     type Output = Self;
     #[inline]
@@ -351,10 +480,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitAnd<bool> for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitAnd<bool> for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     type Output = Self;
     #[inline]
@@ -364,23 +493,23 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitAnd<Mask<T, LANES>> for bool
+impl<T, const N: usize> core::ops::BitAnd<Mask<T, N>> for bool
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
-    type Output = Mask<T, LANES>;
+    type Output = Mask<T, N>;
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    fn bitand(self, rhs: Mask<T, LANES>) -> Mask<T, LANES> {
+    fn bitand(self, rhs: Mask<T, N>) -> Mask<T, N> {
         Mask::splat(self) & rhs
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitOr for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitOr for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     type Output = Self;
     #[inline]
@@ -390,10 +519,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitOr<bool> for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitOr<bool> for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     type Output = Self;
     #[inline]
@@ -403,23 +532,23 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitOr<Mask<T, LANES>> for bool
+impl<T, const N: usize> core::ops::BitOr<Mask<T, N>> for bool
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
-    type Output = Mask<T, LANES>;
+    type Output = Mask<T, N>;
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    fn bitor(self, rhs: Mask<T, LANES>) -> Mask<T, LANES> {
+    fn bitor(self, rhs: Mask<T, N>) -> Mask<T, N> {
         Mask::splat(self) | rhs
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitXor for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitXor for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     type Output = Self;
     #[inline]
@@ -429,10 +558,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitXor<bool> for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitXor<bool> for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     type Output = Self;
     #[inline]
@@ -442,25 +571,25 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitXor<Mask<T, LANES>> for bool
+impl<T, const N: usize> core::ops::BitXor<Mask<T, N>> for bool
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
-    type Output = Mask<T, LANES>;
+    type Output = Mask<T, N>;
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    fn bitxor(self, rhs: Mask<T, LANES>) -> Self::Output {
+    fn bitxor(self, rhs: Mask<T, N>) -> Self::Output {
         Mask::splat(self) ^ rhs
     }
 }
 
-impl<T, const LANES: usize> core::ops::Not for Mask<T, LANES>
+impl<T, const N: usize> core::ops::Not for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
-    type Output = Mask<T, LANES>;
+    type Output = Mask<T, N>;
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn not(self) -> Self::Output {
@@ -468,10 +597,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitAndAssign for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitAndAssign for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn bitand_assign(&mut self, rhs: Self) {
@@ -479,10 +608,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitAndAssign<bool> for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitAndAssign<bool> for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn bitand_assign(&mut self, rhs: bool) {
@@ -490,10 +619,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitOrAssign for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitOrAssign for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn bitor_assign(&mut self, rhs: Self) {
@@ -501,10 +630,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitOrAssign<bool> for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitOrAssign<bool> for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn bitor_assign(&mut self, rhs: bool) {
@@ -512,10 +641,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitXorAssign for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitXorAssign for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn bitxor_assign(&mut self, rhs: Self) {
@@ -523,10 +652,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitXorAssign<bool> for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitXorAssign<bool> for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn bitxor_assign(&mut self, rhs: bool) {
@@ -537,12 +666,12 @@ where
 macro_rules! impl_from {
     { $from:ty  => $($to:ty),* } => {
         $(
-        impl<const LANES: usize> From<Mask<$from, LANES>> for Mask<$to, LANES>
+        impl<const N: usize> From<Mask<$from, N>> for Mask<$to, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
             #[inline]
-            fn from(value: Mask<$from, LANES>) -> Self {
+            fn from(value: Mask<$from, N>) -> Self {
                 value.cast()
             }
         }
diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs
index 20465ba9b07ec..6ddff07fea254 100644
--- a/crates/core_simd/src/masks/bitmask.rs
+++ b/crates/core_simd/src/masks/bitmask.rs
@@ -1,30 +1,30 @@
 #![allow(unused_imports)]
 use super::MaskElement;
 use crate::simd::intrinsics;
-use crate::simd::{LaneCount, Simd, SupportedLaneCount, ToBitMask};
+use crate::simd::{LaneCount, Simd, SupportedLaneCount};
 use core::marker::PhantomData;
 
 /// A mask where each lane is represented by a single bit.
 #[repr(transparent)]
-pub struct Mask<T, const LANES: usize>(
-    <LaneCount<LANES> as SupportedLaneCount>::BitMask,
+pub struct Mask<T, const N: usize>(
+    <LaneCount<N> as SupportedLaneCount>::BitMask,
     PhantomData<T>,
 )
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount;
+    LaneCount<N>: SupportedLaneCount;
 
-impl<T, const LANES: usize> Copy for Mask<T, LANES>
+impl<T, const N: usize> Copy for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
 }
 
-impl<T, const LANES: usize> Clone for Mask<T, LANES>
+impl<T, const N: usize> Clone for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn clone(&self) -> Self {
@@ -32,10 +32,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> PartialEq for Mask<T, LANES>
+impl<T, const N: usize> PartialEq for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn eq(&self, other: &Self) -> bool {
@@ -43,10 +43,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> PartialOrd for Mask<T, LANES>
+impl<T, const N: usize> PartialOrd for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
@@ -54,17 +54,17 @@ where
     }
 }
 
-impl<T, const LANES: usize> Eq for Mask<T, LANES>
+impl<T, const N: usize> Eq for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
 }
 
-impl<T, const LANES: usize> Ord for Mask<T, LANES>
+impl<T, const N: usize> Ord for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn cmp(&self, other: &Self) -> core::cmp::Ordering {
@@ -72,22 +72,22 @@ where
     }
 }
 
-impl<T, const LANES: usize> Mask<T, LANES>
+impl<T, const N: usize> Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
     pub fn splat(value: bool) -> Self {
-        let mut mask = <LaneCount<LANES> as SupportedLaneCount>::BitMask::default();
+        let mut mask = <LaneCount<N> as SupportedLaneCount>::BitMask::default();
         if value {
             mask.as_mut().fill(u8::MAX)
         } else {
             mask.as_mut().fill(u8::MIN)
         }
-        if LANES % 8 > 0 {
-            *mask.as_mut().last_mut().unwrap() &= u8::MAX >> (8 - LANES % 8);
+        if N % 8 > 0 {
+            *mask.as_mut().last_mut().unwrap() &= u8::MAX >> (8 - N % 8);
         }
         Self(mask, PhantomData)
     }
@@ -107,7 +107,7 @@ where
 
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original value"]
-    pub fn to_int(self) -> Simd<T, LANES> {
+    pub fn to_int(self) -> Simd<T, N> {
         unsafe {
             intrinsics::simd_select_bitmask(self.0, Simd::splat(T::TRUE), Simd::splat(T::FALSE))
         }
@@ -115,51 +115,47 @@ where
 
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub unsafe fn from_int_unchecked(value: Simd<T, LANES>) -> Self {
+    pub unsafe fn from_int_unchecked(value: Simd<T, N>) -> Self {
         unsafe { Self(intrinsics::simd_bitmask(value), PhantomData) }
     }
 
-    #[cfg(feature = "generic_const_exprs")]
     #[inline]
-    #[must_use = "method returns a new array and does not mutate the original value"]
-    pub fn to_bitmask_array<const N: usize>(self) -> [u8; N] {
-        assert!(core::mem::size_of::<Self>() == N);
-
-        // Safety: converting an integer to an array of bytes of the same size is safe
-        unsafe { core::mem::transmute_copy(&self.0) }
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    pub fn to_bitmask_vector(self) -> Simd<u8, N> {
+        let mut bitmask = Simd::splat(0);
+        bitmask.as_mut_array()[..self.0.as_ref().len()].copy_from_slice(self.0.as_ref());
+        bitmask
     }
 
-    #[cfg(feature = "generic_const_exprs")]
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn from_bitmask_array<const N: usize>(bitmask: [u8; N]) -> Self {
-        assert!(core::mem::size_of::<Self>() == N);
-
-        // Safety: converting an array of bytes to an integer of the same size is safe
-        Self(unsafe { core::mem::transmute_copy(&bitmask) }, PhantomData)
+    pub fn from_bitmask_vector(bitmask: Simd<u8, N>) -> Self {
+        let mut bytes = <LaneCount<N> as SupportedLaneCount>::BitMask::default();
+        let len = bytes.as_ref().len();
+        bytes.as_mut().copy_from_slice(&bitmask.as_array()[..len]);
+        Self(bytes, PhantomData)
     }
 
     #[inline]
-    pub fn to_bitmask_integer<U>(self) -> U
-    where
-        super::Mask<T, LANES>: ToBitMask<BitMask = U>,
-    {
-        // Safety: these are the same types
-        unsafe { core::mem::transmute_copy(&self.0) }
+    pub fn to_bitmask_integer(self) -> u64 {
+        let mut bitmask = [0u8; 8];
+        bitmask[..self.0.as_ref().len()].copy_from_slice(self.0.as_ref());
+        u64::from_ne_bytes(bitmask)
     }
 
     #[inline]
-    pub fn from_bitmask_integer<U>(bitmask: U) -> Self
-    where
-        super::Mask<T, LANES>: ToBitMask<BitMask = U>,
-    {
-        // Safety: these are the same types
-        unsafe { Self(core::mem::transmute_copy(&bitmask), PhantomData) }
+    pub fn from_bitmask_integer(bitmask: u64) -> Self {
+        let mut bytes = <LaneCount<N> as SupportedLaneCount>::BitMask::default();
+        let len = bytes.as_mut().len();
+        bytes
+            .as_mut()
+            .copy_from_slice(&bitmask.to_ne_bytes()[..len]);
+        Self(bytes, PhantomData)
     }
 
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn convert<U>(self) -> Mask<U, LANES>
+    pub fn convert<U>(self) -> Mask<U, N>
     where
         U: MaskElement,
     {
@@ -180,11 +176,11 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitAnd for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitAnd for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
-    <LaneCount<LANES> as SupportedLaneCount>::BitMask: AsRef<[u8]> + AsMut<[u8]>,
+    LaneCount<N>: SupportedLaneCount,
+    <LaneCount<N> as SupportedLaneCount>::BitMask: AsRef<[u8]> + AsMut<[u8]>,
 {
     type Output = Self;
     #[inline]
@@ -197,11 +193,11 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitOr for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitOr for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
-    <LaneCount<LANES> as SupportedLaneCount>::BitMask: AsRef<[u8]> + AsMut<[u8]>,
+    LaneCount<N>: SupportedLaneCount,
+    <LaneCount<N> as SupportedLaneCount>::BitMask: AsRef<[u8]> + AsMut<[u8]>,
 {
     type Output = Self;
     #[inline]
@@ -214,10 +210,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitXor for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitXor for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     type Output = Self;
     #[inline]
@@ -230,10 +226,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::Not for Mask<T, LANES>
+impl<T, const N: usize> core::ops::Not for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     type Output = Self;
     #[inline]
@@ -242,8 +238,8 @@ where
         for x in self.0.as_mut() {
             *x = !*x;
         }
-        if LANES % 8 > 0 {
-            *self.0.as_mut().last_mut().unwrap() &= u8::MAX >> (8 - LANES % 8);
+        if N % 8 > 0 {
+            *self.0.as_mut().last_mut().unwrap() &= u8::MAX >> (8 - N % 8);
         }
         self
     }
diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs
index 1d13c45b8e70b..63964f455e05c 100644
--- a/crates/core_simd/src/masks/full_masks.rs
+++ b/crates/core_simd/src/masks/full_masks.rs
@@ -1,29 +1,25 @@
 //! Masks that take up full SIMD vector registers.
 
-use super::MaskElement;
 use crate::simd::intrinsics;
-use crate::simd::{LaneCount, Simd, SupportedLaneCount, ToBitMask};
-
-#[cfg(feature = "generic_const_exprs")]
-use crate::simd::ToBitMaskArray;
+use crate::simd::{LaneCount, MaskElement, Simd, SupportedLaneCount};
 
 #[repr(transparent)]
-pub struct Mask<T, const LANES: usize>(Simd<T, LANES>)
+pub struct Mask<T, const N: usize>(Simd<T, N>)
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount;
+    LaneCount<N>: SupportedLaneCount;
 
-impl<T, const LANES: usize> Copy for Mask<T, LANES>
+impl<T, const N: usize> Copy for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
 }
 
-impl<T, const LANES: usize> Clone for Mask<T, LANES>
+impl<T, const N: usize> Clone for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
@@ -32,10 +28,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> PartialEq for Mask<T, LANES>
+impl<T, const N: usize> PartialEq for Mask<T, N>
 where
     T: MaskElement + PartialEq,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn eq(&self, other: &Self) -> bool {
@@ -43,10 +39,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> PartialOrd for Mask<T, LANES>
+impl<T, const N: usize> PartialOrd for Mask<T, N>
 where
     T: MaskElement + PartialOrd,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
@@ -54,17 +50,17 @@ where
     }
 }
 
-impl<T, const LANES: usize> Eq for Mask<T, LANES>
+impl<T, const N: usize> Eq for Mask<T, N>
 where
     T: MaskElement + Eq,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
 }
 
-impl<T, const LANES: usize> Ord for Mask<T, LANES>
+impl<T, const N: usize> Ord for Mask<T, N>
 where
     T: MaskElement + Ord,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn cmp(&self, other: &Self) -> core::cmp::Ordering {
@@ -101,10 +97,10 @@ macro_rules! impl_reverse_bits {
 
 impl_reverse_bits! { u8, u16, u32, u64 }
 
-impl<T, const LANES: usize> Mask<T, LANES>
+impl<T, const N: usize> Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
@@ -125,19 +121,19 @@ where
 
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original value"]
-    pub fn to_int(self) -> Simd<T, LANES> {
+    pub fn to_int(self) -> Simd<T, N> {
         self.0
     }
 
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub unsafe fn from_int_unchecked(value: Simd<T, LANES>) -> Self {
+    pub unsafe fn from_int_unchecked(value: Simd<T, N>) -> Self {
         Self(value)
     }
 
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn convert<U>(self) -> Mask<U, LANES>
+    pub fn convert<U>(self) -> Mask<U, N>
     where
         U: MaskElement,
     {
@@ -145,62 +141,50 @@ where
         unsafe { Mask(intrinsics::simd_cast(self.0)) }
     }
 
-    #[cfg(feature = "generic_const_exprs")]
     #[inline]
-    #[must_use = "method returns a new array and does not mutate the original value"]
-    pub fn to_bitmask_array<const N: usize>(self) -> [u8; N]
-    where
-        super::Mask<T, LANES>: ToBitMaskArray,
-        [(); <super::Mask<T, LANES> as ToBitMaskArray>::BYTES]: Sized,
-    {
-        assert_eq!(<super::Mask<T, LANES> as ToBitMaskArray>::BYTES, N);
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    pub fn to_bitmask_vector(self) -> Simd<u8, N> {
+        let mut bitmask = Simd::splat(0);
 
-        // Safety: N is the correct bitmask size
+        // Safety: `bytes` has the `BitMask` type that `LaneCount<N>` provides for this N-lane vector
         unsafe {
             // Compute the bitmask
-            let bitmask: [u8; <super::Mask<T, LANES> as ToBitMaskArray>::BYTES] =
+            let mut bytes: <LaneCount<N> as SupportedLaneCount>::BitMask =
                 intrinsics::simd_bitmask(self.0);
 
-            // Transmute to the return type, previously asserted to be the same size
-            let mut bitmask: [u8; N] = core::mem::transmute_copy(&bitmask);
-
             // LLVM assumes bit order should match endianness
             if cfg!(target_endian = "big") {
-                for x in bitmask.as_mut() {
-                    *x = x.reverse_bits();
+                for x in bytes.as_mut() {
+                    *x = x.reverse_bits()
                 }
-            };
+            }
 
-            bitmask
+            bitmask.as_mut_array()[..bytes.as_ref().len()].copy_from_slice(bytes.as_ref());
         }
+
+        bitmask
     }
 
-    #[cfg(feature = "generic_const_exprs")]
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn from_bitmask_array<const N: usize>(mut bitmask: [u8; N]) -> Self
-    where
-        super::Mask<T, LANES>: ToBitMaskArray,
-        [(); <super::Mask<T, LANES> as ToBitMaskArray>::BYTES]: Sized,
-    {
-        assert_eq!(<super::Mask<T, LANES> as ToBitMaskArray>::BYTES, N);
+    pub fn from_bitmask_vector(bitmask: Simd<u8, N>) -> Self {
+        let mut bytes = <LaneCount<N> as SupportedLaneCount>::BitMask::default();
 
-        // Safety: N is the correct bitmask size
+        // Safety: `bytes` has the `BitMask` type that `LaneCount<N>` provides for an N-lane vector
         unsafe {
+            let len = bytes.as_ref().len();
+            bytes.as_mut().copy_from_slice(&bitmask.as_array()[..len]);
+
             // LLVM assumes bit order should match endianness
             if cfg!(target_endian = "big") {
-                for x in bitmask.as_mut() {
+                for x in bytes.as_mut() {
                     *x = x.reverse_bits();
                 }
             }
 
-            // Transmute to the bitmask type, previously asserted to be the same size
-            let bitmask: [u8; <super::Mask<T, LANES> as ToBitMaskArray>::BYTES] =
-                core::mem::transmute_copy(&bitmask);
-
             // Compute the regular mask
             Self::from_int_unchecked(intrinsics::simd_select_bitmask(
-                bitmask,
+                bytes,
                 Self::splat(true).to_int(),
                 Self::splat(false).to_int(),
             ))
@@ -208,40 +192,81 @@ where
     }
 
     #[inline]
-    pub(crate) fn to_bitmask_integer<U: ReverseBits>(self) -> U
+    unsafe fn to_bitmask_impl<U: ReverseBits, const M: usize>(self) -> U
     where
-        super::Mask<T, LANES>: ToBitMask<BitMask = U>,
+        LaneCount<M>: SupportedLaneCount,
     {
-        // Safety: U is required to be the appropriate bitmask type
-        let bitmask: U = unsafe { intrinsics::simd_bitmask(self.0) };
+        let resized = self.to_int().resize::<M>(T::FALSE);
+
+        // Safety: `resized` is an integer vector with length M; the caller guarantees U is the bitmask type for M lanes
+        let bitmask: U = unsafe { intrinsics::simd_bitmask(resized) };
 
         // LLVM assumes bit order should match endianness
         if cfg!(target_endian = "big") {
-            bitmask.reverse_bits(LANES)
+            bitmask.reverse_bits(M)
         } else {
             bitmask
         }
     }
 
     #[inline]
-    pub(crate) fn from_bitmask_integer<U: ReverseBits>(bitmask: U) -> Self
+    unsafe fn from_bitmask_impl<U: ReverseBits, const M: usize>(bitmask: U) -> Self
     where
-        super::Mask<T, LANES>: ToBitMask<BitMask = U>,
+        LaneCount<M>: SupportedLaneCount,
     {
         // LLVM assumes bit order should match endianness
         let bitmask = if cfg!(target_endian = "big") {
-            bitmask.reverse_bits(LANES)
+            bitmask.reverse_bits(M)
         } else {
             bitmask
         };
 
-        // Safety: U is required to be the appropriate bitmask type
-        unsafe {
-            Self::from_int_unchecked(intrinsics::simd_select_bitmask(
+        // SAFETY: the caller guarantees `bitmask` (a `U`) is the correct bitmask type for an M-lane vector
+        let mask: Simd<T, M> = unsafe {
+            intrinsics::simd_select_bitmask(
                 bitmask,
-                Self::splat(true).to_int(),
-                Self::splat(false).to_int(),
-            ))
+                Simd::<T, M>::splat(T::TRUE),
+                Simd::<T, M>::splat(T::FALSE),
+            )
+        };
+
+        // SAFETY: `mask` only contains `T::TRUE` or `T::FALSE`
+        unsafe { Self::from_int_unchecked(mask.resize::<N>(T::FALSE)) }
+    }
+
+    #[inline]
+    pub(crate) fn to_bitmask_integer(self) -> u64 {
+        // TODO modify simd_bitmask to zero-extend output, making this unnecessary
+        if N <= 8 {
+            // Safety: bitmask matches length
+            unsafe { self.to_bitmask_impl::<u8, 8>() as u64 }
+        } else if N <= 16 {
+            // Safety: bitmask matches length
+            unsafe { self.to_bitmask_impl::<u16, 16>() as u64 }
+        } else if N <= 32 {
+            // Safety: bitmask matches length
+            unsafe { self.to_bitmask_impl::<u32, 32>() as u64 }
+        } else {
+            // Safety: bitmask matches length
+            unsafe { self.to_bitmask_impl::<u64, 64>() }
+        }
+    }
+
+    #[inline]
+    pub(crate) fn from_bitmask_integer(bitmask: u64) -> Self {
+        // TODO modify simd_select_bitmask to truncate input, making this unnecessary
+        if N <= 8 {
+            // Safety: bitmask matches length
+            unsafe { Self::from_bitmask_impl::<u8, 8>(bitmask as u8) }
+        } else if N <= 16 {
+            // Safety: bitmask matches length
+            unsafe { Self::from_bitmask_impl::<u16, 16>(bitmask as u16) }
+        } else if N <= 32 {
+            // Safety: bitmask matches length
+            unsafe { Self::from_bitmask_impl::<u32, 32>(bitmask as u32) }
+        } else {
+            // Safety: bitmask matches length
+            unsafe { Self::from_bitmask_impl::<u64, 64>(bitmask) }
         }
     }
 
@@ -260,21 +285,21 @@ where
     }
 }
 
-impl<T, const LANES: usize> From<Mask<T, LANES>> for Simd<T, LANES>
+impl<T, const N: usize> From<Mask<T, N>> for Simd<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
-    fn from(value: Mask<T, LANES>) -> Self {
+    fn from(value: Mask<T, N>) -> Self {
         value.0
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitAnd for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitAnd for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     type Output = Self;
     #[inline]
@@ -285,10 +310,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitOr for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitOr for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     type Output = Self;
     #[inline]
@@ -299,10 +324,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::BitXor for Mask<T, LANES>
+impl<T, const N: usize> core::ops::BitXor for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     type Output = Self;
     #[inline]
@@ -313,10 +338,10 @@ where
     }
 }
 
-impl<T, const LANES: usize> core::ops::Not for Mask<T, LANES>
+impl<T, const N: usize> core::ops::Not for Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     type Output = Self;
     #[inline]
diff --git a/crates/core_simd/src/masks/to_bitmask.rs b/crates/core_simd/src/masks/to_bitmask.rs
deleted file mode 100644
index fc7d6b781f2f5..0000000000000
--- a/crates/core_simd/src/masks/to_bitmask.rs
+++ /dev/null
@@ -1,97 +0,0 @@
-use super::{mask_impl, Mask, MaskElement};
-use crate::simd::{LaneCount, SupportedLaneCount};
-
-mod sealed {
-    pub trait Sealed {}
-}
-pub use sealed::Sealed;
-
-impl<T, const LANES: usize> Sealed for Mask<T, LANES>
-where
-    T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
-{
-}
-
-/// Converts masks to and from integer bitmasks.
-///
-/// Each bit of the bitmask corresponds to a mask lane, starting with the LSB.
-pub trait ToBitMask: Sealed {
-    /// The integer bitmask type.
-    type BitMask;
-
-    /// Converts a mask to a bitmask.
-    fn to_bitmask(self) -> Self::BitMask;
-
-    /// Converts a bitmask to a mask.
-    fn from_bitmask(bitmask: Self::BitMask) -> Self;
-}
-
-/// Converts masks to and from byte array bitmasks.
-///
-/// Each bit of the bitmask corresponds to a mask lane, starting with the LSB of the first byte.
-#[cfg(feature = "generic_const_exprs")]
-pub trait ToBitMaskArray: Sealed {
-    /// The length of the bitmask array.
-    const BYTES: usize;
-
-    /// Converts a mask to a bitmask.
-    fn to_bitmask_array(self) -> [u8; Self::BYTES];
-
-    /// Converts a bitmask to a mask.
-    fn from_bitmask_array(bitmask: [u8; Self::BYTES]) -> Self;
-}
-
-macro_rules! impl_integer_intrinsic {
-    { $(impl ToBitMask<BitMask=$int:ty> for Mask<_, $lanes:literal>)* } => {
-        $(
-        impl<T: MaskElement> ToBitMask for Mask<T, $lanes> {
-            type BitMask = $int;
-
-            #[inline]
-            fn to_bitmask(self) -> $int {
-                self.0.to_bitmask_integer()
-            }
-
-            #[inline]
-            fn from_bitmask(bitmask: $int) -> Self {
-                Self(mask_impl::Mask::from_bitmask_integer(bitmask))
-            }
-        }
-        )*
-    }
-}
-
-impl_integer_intrinsic! {
-    impl ToBitMask<BitMask=u8> for Mask<_, 1>
-    impl ToBitMask<BitMask=u8> for Mask<_, 2>
-    impl ToBitMask<BitMask=u8> for Mask<_, 4>
-    impl ToBitMask<BitMask=u8> for Mask<_, 8>
-    impl ToBitMask<BitMask=u16> for Mask<_, 16>
-    impl ToBitMask<BitMask=u32> for Mask<_, 32>
-    impl ToBitMask<BitMask=u64> for Mask<_, 64>
-}
-
-/// Returns the minimum number of bytes in a bitmask with `lanes` lanes.
-#[cfg(feature = "generic_const_exprs")]
-pub const fn bitmask_len(lanes: usize) -> usize {
-    (lanes + 7) / 8
-}
-
-#[cfg(feature = "generic_const_exprs")]
-impl<T: MaskElement, const LANES: usize> ToBitMaskArray for Mask<T, LANES>
-where
-    LaneCount<LANES>: SupportedLaneCount,
-{
-    const BYTES: usize = bitmask_len(LANES);
-
-    #[inline]
-    fn to_bitmask_array(self) -> [u8; Self::BYTES] {
-        self.0.to_bitmask_array()
-    }
-
-    #[inline]
-    fn from_bitmask_array(bitmask: [u8; Self::BYTES]) -> Self {
-        Mask(mask_impl::Mask::from_bitmask_array(bitmask))
-    }
-}
diff --git a/crates/core_simd/src/mod.rs b/crates/core_simd/src/mod.rs
index 19426769858b0..fd016f1c6f7ab 100644
--- a/crates/core_simd/src/mod.rs
+++ b/crates/core_simd/src/mod.rs
@@ -3,37 +3,37 @@ mod swizzle;
 
 pub(crate) mod intrinsics;
 
-#[cfg(feature = "generic_const_exprs")]
-mod to_bytes;
-
 mod alias;
 mod cast;
-mod elements;
-mod eq;
 mod fmt;
 mod iter;
 mod lane_count;
 mod masks;
 mod ops;
-mod ord;
 mod select;
 mod swizzle_dyn;
+mod to_bytes;
 mod vector;
 mod vendor;
 
-#[doc = include_str!("core_simd_docs.md")]
 pub mod simd {
+    #![doc = include_str!("core_simd_docs.md")]
+
     pub mod prelude;
 
+    pub mod num;
+
+    pub mod ptr;
+
+    pub mod cmp;
+
     pub(crate) use crate::core_simd::intrinsics;
 
     pub use crate::core_simd::alias::*;
     pub use crate::core_simd::cast::*;
-    pub use crate::core_simd::elements::*;
-    pub use crate::core_simd::eq::*;
     pub use crate::core_simd::lane_count::{LaneCount, SupportedLaneCount};
     pub use crate::core_simd::masks::*;
-    pub use crate::core_simd::ord::*;
     pub use crate::core_simd::swizzle::*;
+    pub use crate::core_simd::to_bytes::ToBytes;
     pub use crate::core_simd::vector::*;
 }
diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index b007456cf2cc5..8a1b083f03982 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -1,4 +1,4 @@
-use crate::simd::{LaneCount, Simd, SimdElement, SimdPartialEq, SupportedLaneCount};
+use crate::simd::{cmp::SimdPartialEq, LaneCount, Simd, SimdElement, SupportedLaneCount};
 use core::ops::{Add, Mul};
 use core::ops::{BitAnd, BitOr, BitXor};
 use core::ops::{Div, Rem, Sub};
@@ -6,12 +6,13 @@ use core::ops::{Shl, Shr};
 
 mod assign;
 mod deref;
+mod shift_scalar;
 mod unary;
 
-impl<I, T, const LANES: usize> core::ops::Index<I> for Simd<T, LANES>
+impl<I, T, const N: usize> core::ops::Index<I> for Simd<T, N>
 where
     T: SimdElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     I: core::slice::SliceIndex<[T]>,
 {
     type Output = I::Output;
@@ -21,10 +22,10 @@ where
     }
 }
 
-impl<I, T, const LANES: usize> core::ops::IndexMut<I> for Simd<T, LANES>
+impl<I, T, const N: usize> core::ops::IndexMut<I> for Simd<T, N>
 where
     T: SimdElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
     I: core::slice::SliceIndex<[T]>,
 {
     #[inline]
diff --git a/crates/core_simd/src/ops/assign.rs b/crates/core_simd/src/ops/assign.rs
index d2b48614fc966..0e87785025a38 100644
--- a/crates/core_simd/src/ops/assign.rs
+++ b/crates/core_simd/src/ops/assign.rs
@@ -8,7 +8,7 @@ use core::ops::{ShlAssign, ShrAssign}; // non-commutative bit binary op-assignme
 // Arithmetic
 
 macro_rules! assign_ops {
-    ($(impl<T, U, const LANES: usize> $assignTrait:ident<U> for Simd<T, LANES>
+    ($(impl<T, U, const N: usize> $assignTrait:ident<U> for Simd<T, N>
         where
             Self: $trait:ident,
         {
@@ -16,11 +16,11 @@ macro_rules! assign_ops {
                 $call:ident
             }
         })*) => {
-        $(impl<T, U, const LANES: usize> $assignTrait<U> for Simd<T, LANES>
+        $(impl<T, U, const N: usize> $assignTrait<U> for Simd<T, N>
         where
             Self: $trait<U, Output = Self>,
             T: SimdElement,
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
             #[inline]
             fn $assign_call(&mut self, rhs: U) {
@@ -32,7 +32,7 @@ macro_rules! assign_ops {
 
 assign_ops! {
     // Arithmetic
-    impl<T, U, const LANES: usize> AddAssign<U> for Simd<T, LANES>
+    impl<T, U, const N: usize> AddAssign<U> for Simd<T, N>
     where
         Self: Add,
     {
@@ -41,7 +41,7 @@ assign_ops! {
         }
     }
 
-    impl<T, U, const LANES: usize> MulAssign<U> for Simd<T, LANES>
+    impl<T, U, const N: usize> MulAssign<U> for Simd<T, N>
     where
         Self: Mul,
     {
@@ -50,7 +50,7 @@ assign_ops! {
         }
     }
 
-    impl<T, U, const LANES: usize> SubAssign<U> for Simd<T, LANES>
+    impl<T, U, const N: usize> SubAssign<U> for Simd<T, N>
     where
         Self: Sub,
     {
@@ -59,7 +59,7 @@ assign_ops! {
         }
     }
 
-    impl<T, U, const LANES: usize> DivAssign<U> for Simd<T, LANES>
+    impl<T, U, const N: usize> DivAssign<U> for Simd<T, N>
     where
         Self: Div,
     {
@@ -67,7 +67,7 @@ assign_ops! {
             div
         }
     }
-    impl<T, U, const LANES: usize> RemAssign<U> for Simd<T, LANES>
+    impl<T, U, const N: usize> RemAssign<U> for Simd<T, N>
     where
         Self: Rem,
     {
@@ -77,7 +77,7 @@ assign_ops! {
     }
 
     // Bitops
-    impl<T, U, const LANES: usize> BitAndAssign<U> for Simd<T, LANES>
+    impl<T, U, const N: usize> BitAndAssign<U> for Simd<T, N>
     where
         Self: BitAnd,
     {
@@ -86,7 +86,7 @@ assign_ops! {
         }
     }
 
-    impl<T, U, const LANES: usize> BitOrAssign<U> for Simd<T, LANES>
+    impl<T, U, const N: usize> BitOrAssign<U> for Simd<T, N>
     where
         Self: BitOr,
     {
@@ -95,7 +95,7 @@ assign_ops! {
         }
     }
 
-    impl<T, U, const LANES: usize> BitXorAssign<U> for Simd<T, LANES>
+    impl<T, U, const N: usize> BitXorAssign<U> for Simd<T, N>
     where
         Self: BitXor,
     {
@@ -104,7 +104,7 @@ assign_ops! {
         }
     }
 
-    impl<T, U, const LANES: usize> ShlAssign<U> for Simd<T, LANES>
+    impl<T, U, const N: usize> ShlAssign<U> for Simd<T, N>
     where
         Self: Shl,
     {
@@ -113,7 +113,7 @@ assign_ops! {
         }
     }
 
-    impl<T, U, const LANES: usize> ShrAssign<U> for Simd<T, LANES>
+    impl<T, U, const N: usize> ShrAssign<U> for Simd<T, N>
     where
         Self: Shr,
     {
diff --git a/crates/core_simd/src/ops/deref.rs b/crates/core_simd/src/ops/deref.rs
index 302bf148bd3ef..89a60ba114146 100644
--- a/crates/core_simd/src/ops/deref.rs
+++ b/crates/core_simd/src/ops/deref.rs
@@ -5,16 +5,16 @@
 use super::*;
 
 macro_rules! deref_lhs {
-    (impl<T, const LANES: usize> $trait:ident for $simd:ty {
+    (impl<T, const N: usize> $trait:ident for $simd:ty {
             fn $call:ident
         }) => {
-        impl<T, const LANES: usize> $trait<$simd> for &$simd
+        impl<T, const N: usize> $trait<$simd> for &$simd
         where
             T: SimdElement,
             $simd: $trait<$simd, Output = $simd>,
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
-            type Output = Simd<T, LANES>;
+            type Output = Simd<T, N>;
 
             #[inline]
             #[must_use = "operator returns a new vector without mutating the inputs"]
@@ -26,16 +26,16 @@ macro_rules! deref_lhs {
 }
 
 macro_rules! deref_rhs {
-    (impl<T, const LANES: usize> $trait:ident for $simd:ty {
+    (impl<T, const N: usize> $trait:ident for $simd:ty {
             fn $call:ident
         }) => {
-        impl<T, const LANES: usize> $trait<&$simd> for $simd
+        impl<T, const N: usize> $trait<&$simd> for $simd
         where
             T: SimdElement,
             $simd: $trait<$simd, Output = $simd>,
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
-            type Output = Simd<T, LANES>;
+            type Output = Simd<T, N>;
 
             #[inline]
             #[must_use = "operator returns a new vector without mutating the inputs"]
@@ -47,25 +47,25 @@ macro_rules! deref_rhs {
 }
 
 macro_rules! deref_ops {
-    ($(impl<T, const LANES: usize> $trait:ident for $simd:ty {
+    ($(impl<T, const N: usize> $trait:ident for $simd:ty {
             fn $call:ident
         })*) => {
         $(
             deref_rhs! {
-                impl<T, const LANES: usize> $trait for $simd {
+                impl<T, const N: usize> $trait for $simd {
                     fn $call
                 }
             }
             deref_lhs! {
-                impl<T, const LANES: usize> $trait for $simd {
+                impl<T, const N: usize> $trait for $simd {
                     fn $call
                 }
             }
-            impl<'lhs, 'rhs, T, const LANES: usize> $trait<&'rhs $simd> for &'lhs $simd
+            impl<'lhs, 'rhs, T, const N: usize> $trait<&'rhs $simd> for &'lhs $simd
             where
                 T: SimdElement,
                 $simd: $trait<$simd, Output = $simd>,
-                LaneCount<LANES>: SupportedLaneCount,
+                LaneCount<N>: SupportedLaneCount,
             {
                 type Output = $simd;
 
@@ -81,44 +81,44 @@ macro_rules! deref_ops {
 
 deref_ops! {
     // Arithmetic
-    impl<T, const LANES: usize> Add for Simd<T, LANES> {
+    impl<T, const N: usize> Add for Simd<T, N> {
         fn add
     }
 
-    impl<T, const LANES: usize> Mul for Simd<T, LANES> {
+    impl<T, const N: usize> Mul for Simd<T, N> {
         fn mul
     }
 
-    impl<T, const LANES: usize> Sub for Simd<T, LANES> {
+    impl<T, const N: usize> Sub for Simd<T, N> {
         fn sub
     }
 
-    impl<T, const LANES: usize> Div for Simd<T, LANES> {
+    impl<T, const N: usize> Div for Simd<T, N> {
         fn div
     }
 
-    impl<T, const LANES: usize> Rem for Simd<T, LANES> {
+    impl<T, const N: usize> Rem for Simd<T, N> {
         fn rem
     }
 
     // Bitops
-    impl<T, const LANES: usize> BitAnd for Simd<T, LANES> {
+    impl<T, const N: usize> BitAnd for Simd<T, N> {
         fn bitand
     }
 
-    impl<T, const LANES: usize> BitOr for Simd<T, LANES> {
+    impl<T, const N: usize> BitOr for Simd<T, N> {
         fn bitor
     }
 
-    impl<T, const LANES: usize> BitXor for Simd<T, LANES> {
+    impl<T, const N: usize> BitXor for Simd<T, N> {
         fn bitxor
     }
 
-    impl<T, const LANES: usize> Shl for Simd<T, LANES> {
+    impl<T, const N: usize> Shl for Simd<T, N> {
         fn shl
     }
 
-    impl<T, const LANES: usize> Shr for Simd<T, LANES> {
+    impl<T, const N: usize> Shr for Simd<T, N> {
         fn shr
     }
 }
diff --git a/crates/core_simd/src/ops/shift_scalar.rs b/crates/core_simd/src/ops/shift_scalar.rs
new file mode 100644
index 0000000000000..f5115a5a5e935
--- /dev/null
+++ b/crates/core_simd/src/ops/shift_scalar.rs
@@ -0,0 +1,62 @@
+// Shifts are unusual among operators: typically only a scalar is on the right-hand side.
+// Here, we implement shifts for scalar RHS arguments.
+
+use crate::simd::{LaneCount, Simd, SupportedLaneCount};
+
+macro_rules! impl_splatted_shifts {
+    { impl $trait:ident :: $trait_fn:ident for $ty:ty } => {
+        impl<const N: usize> core::ops::$trait<$ty> for Simd<$ty, N>
+        where
+            LaneCount<N>: SupportedLaneCount,
+        {
+            type Output = Self;
+            #[inline]
+            fn $trait_fn(self, rhs: $ty) -> Self::Output {
+                self.$trait_fn(Simd::splat(rhs))
+            }
+        }
+
+        impl<const N: usize> core::ops::$trait<&$ty> for Simd<$ty, N>
+        where
+            LaneCount<N>: SupportedLaneCount,
+        {
+            type Output = Self;
+            #[inline]
+            fn $trait_fn(self, rhs: &$ty) -> Self::Output {
+                self.$trait_fn(Simd::splat(*rhs))
+            }
+        }
+
+        impl<'lhs, const N: usize> core::ops::$trait<$ty> for &'lhs Simd<$ty, N>
+        where
+            LaneCount<N>: SupportedLaneCount,
+        {
+            type Output = Simd<$ty, N>;
+            #[inline]
+            fn $trait_fn(self, rhs: $ty) -> Self::Output {
+                self.$trait_fn(Simd::splat(rhs))
+            }
+        }
+
+        impl<'lhs, const N: usize> core::ops::$trait<&$ty> for &'lhs Simd<$ty, N>
+        where
+            LaneCount<N>: SupportedLaneCount,
+        {
+            type Output = Simd<$ty, N>;
+            #[inline]
+            fn $trait_fn(self, rhs: &$ty) -> Self::Output {
+                self.$trait_fn(Simd::splat(*rhs))
+            }
+        }
+    };
+    { $($ty:ty),* } => {
+        $(
+        impl_splatted_shifts! { impl Shl::shl for $ty }
+        impl_splatted_shifts! { impl Shr::shr for $ty }
+        )*
+    }
+}
+
+// In the past there were inference issues when generically splatting arguments.
+// Enumerate them instead.
+impl_splatted_shifts! { i8, i16, i32, i64, isize, u8, u16, u32, u64, usize }
diff --git a/crates/core_simd/src/ops/unary.rs b/crates/core_simd/src/ops/unary.rs
index 4ad02215034be..a651aa73e9524 100644
--- a/crates/core_simd/src/ops/unary.rs
+++ b/crates/core_simd/src/ops/unary.rs
@@ -3,11 +3,11 @@ use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
 use core::ops::{Neg, Not}; // unary ops
 
 macro_rules! neg {
-    ($(impl<const LANES: usize> Neg for Simd<$scalar:ty, LANES>)*) => {
-        $(impl<const LANES: usize> Neg for Simd<$scalar, LANES>
+    ($(impl<const N: usize> Neg for Simd<$scalar:ty, N>)*) => {
+        $(impl<const N: usize> Neg for Simd<$scalar, N>
         where
             $scalar: SimdElement,
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
             type Output = Self;
 
@@ -22,27 +22,27 @@ macro_rules! neg {
 }
 
 neg! {
-    impl<const LANES: usize> Neg for Simd<f32, LANES>
+    impl<const N: usize> Neg for Simd<f32, N>
 
-    impl<const LANES: usize> Neg for Simd<f64, LANES>
+    impl<const N: usize> Neg for Simd<f64, N>
 
-    impl<const LANES: usize> Neg for Simd<i8, LANES>
+    impl<const N: usize> Neg for Simd<i8, N>
 
-    impl<const LANES: usize> Neg for Simd<i16, LANES>
+    impl<const N: usize> Neg for Simd<i16, N>
 
-    impl<const LANES: usize> Neg for Simd<i32, LANES>
+    impl<const N: usize> Neg for Simd<i32, N>
 
-    impl<const LANES: usize> Neg for Simd<i64, LANES>
+    impl<const N: usize> Neg for Simd<i64, N>
 
-    impl<const LANES: usize> Neg for Simd<isize, LANES>
+    impl<const N: usize> Neg for Simd<isize, N>
 }
 
 macro_rules! not {
-    ($(impl<const LANES: usize> Not for Simd<$scalar:ty, LANES>)*) => {
-        $(impl<const LANES: usize> Not for Simd<$scalar, LANES>
+    ($(impl<const N: usize> Not for Simd<$scalar:ty, N>)*) => {
+        $(impl<const N: usize> Not for Simd<$scalar, N>
         where
             $scalar: SimdElement,
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
             type Output = Self;
 
@@ -56,23 +56,23 @@ macro_rules! not {
 }
 
 not! {
-    impl<const LANES: usize> Not for Simd<i8, LANES>
+    impl<const N: usize> Not for Simd<i8, N>
 
-    impl<const LANES: usize> Not for Simd<i16, LANES>
+    impl<const N: usize> Not for Simd<i16, N>
 
-    impl<const LANES: usize> Not for Simd<i32, LANES>
+    impl<const N: usize> Not for Simd<i32, N>
 
-    impl<const LANES: usize> Not for Simd<i64, LANES>
+    impl<const N: usize> Not for Simd<i64, N>
 
-    impl<const LANES: usize> Not for Simd<isize, LANES>
+    impl<const N: usize> Not for Simd<isize, N>
 
-    impl<const LANES: usize> Not for Simd<u8, LANES>
+    impl<const N: usize> Not for Simd<u8, N>
 
-    impl<const LANES: usize> Not for Simd<u16, LANES>
+    impl<const N: usize> Not for Simd<u16, N>
 
-    impl<const LANES: usize> Not for Simd<u32, LANES>
+    impl<const N: usize> Not for Simd<u32, N>
 
-    impl<const LANES: usize> Not for Simd<u64, LANES>
+    impl<const N: usize> Not for Simd<u64, N>
 
-    impl<const LANES: usize> Not for Simd<usize, LANES>
+    impl<const N: usize> Not for Simd<usize, N>
 }
diff --git a/crates/core_simd/src/select.rs b/crates/core_simd/src/select.rs
index 065c5987d3fc9..cdcf8eeec815a 100644
--- a/crates/core_simd/src/select.rs
+++ b/crates/core_simd/src/select.rs
@@ -1,15 +1,15 @@
 use crate::simd::intrinsics;
 use crate::simd::{LaneCount, Mask, MaskElement, Simd, SimdElement, SupportedLaneCount};
 
-impl<T, const LANES: usize> Mask<T, LANES>
+impl<T, const N: usize> Mask<T, N>
 where
     T: MaskElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
-    /// Choose lanes from two vectors.
+    /// Choose elements from two vectors.
     ///
-    /// For each lane in the mask, choose the corresponding lane from `true_values` if
-    /// that lane mask is true, and `false_values` if that lane mask is false.
+    /// For each element in the mask, choose the corresponding element from `true_values` if
+    /// that mask element is true, and from `false_values` if that mask element is false.
     ///
     /// # Examples
     /// ```
@@ -23,11 +23,7 @@ where
     /// ```
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
-    pub fn select<U>(
-        self,
-        true_values: Simd<U, LANES>,
-        false_values: Simd<U, LANES>,
-    ) -> Simd<U, LANES>
+    pub fn select<U>(self, true_values: Simd<U, N>, false_values: Simd<U, N>) -> Simd<U, N>
     where
         U: SimdElement<Mask = T>,
     {
@@ -36,10 +32,10 @@ where
         unsafe { intrinsics::simd_select(self.to_int(), true_values, false_values) }
     }
 
-    /// Choose lanes from two masks.
+    /// Choose elements from two masks.
     ///
-    /// For each lane in the mask, choose the corresponding lane from `true_values` if
-    /// that lane mask is true, and `false_values` if that lane mask is false.
+    /// For each element in the mask, choose the corresponding element from `true_values` if
+    /// that mask element is true, and from `false_values` if that mask element is false.
     ///
     /// # Examples
     /// ```
diff --git a/crates/core_simd/src/simd/cmp.rs b/crates/core_simd/src/simd/cmp.rs
new file mode 100644
index 0000000000000..a8d81dbf20f16
--- /dev/null
+++ b/crates/core_simd/src/simd/cmp.rs
@@ -0,0 +1,7 @@
+//! Traits for comparing and ordering vectors.
+
+mod eq;
+mod ord;
+
+pub use eq::*;
+pub use ord::*;
diff --git a/crates/core_simd/src/eq.rs b/crates/core_simd/src/simd/cmp/eq.rs
similarity index 74%
rename from crates/core_simd/src/eq.rs
rename to crates/core_simd/src/simd/cmp/eq.rs
index 80763c0727278..f132fa2cc0cae 100644
--- a/crates/core_simd/src/eq.rs
+++ b/crates/core_simd/src/simd/cmp/eq.rs
@@ -1,5 +1,7 @@
 use crate::simd::{
-    intrinsics, LaneCount, Mask, Simd, SimdConstPtr, SimdElement, SimdMutPtr, SupportedLaneCount,
+    intrinsics,
+    ptr::{SimdConstPtr, SimdMutPtr},
+    LaneCount, Mask, Simd, SimdElement, SupportedLaneCount,
 };
 
 /// Parallel `PartialEq`.
@@ -7,11 +9,11 @@ pub trait SimdPartialEq {
     /// The mask type returned by each comparison.
     type Mask;
 
-    /// Test if each lane is equal to the corresponding lane in `other`.
+    /// Test if each element is equal to the corresponding element in `other`.
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn simd_eq(self, other: Self) -> Self::Mask;
 
-    /// Test if each lane is equal to the corresponding lane in `other`.
+    /// Test if each element is not equal to the corresponding element in `other`.
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn simd_ne(self, other: Self) -> Self::Mask;
 }
@@ -19,11 +21,11 @@ pub trait SimdPartialEq {
 macro_rules! impl_number {
     { $($number:ty),* } => {
         $(
-        impl<const LANES: usize> SimdPartialEq for Simd<$number, LANES>
+        impl<const N: usize> SimdPartialEq for Simd<$number, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
-            type Mask = Mask<<$number as SimdElement>::Mask, LANES>;
+            type Mask = Mask<<$number as SimdElement>::Mask, N>;
 
             #[inline]
             fn simd_eq(self, other: Self) -> Self::Mask {
@@ -48,9 +50,9 @@ impl_number! { f32, f64, u8, u16, u32, u64, usize, i8, i16, i32, i64, isize }
 macro_rules! impl_mask {
     { $($integer:ty),* } => {
         $(
-        impl<const LANES: usize> SimdPartialEq for Mask<$integer, LANES>
+        impl<const N: usize> SimdPartialEq for Mask<$integer, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
             type Mask = Self;
 
@@ -74,11 +76,11 @@ macro_rules! impl_mask {
 
 impl_mask! { i8, i16, i32, i64, isize }
 
-impl<T, const LANES: usize> SimdPartialEq for Simd<*const T, LANES>
+impl<T, const N: usize> SimdPartialEq for Simd<*const T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
-    type Mask = Mask<isize, LANES>;
+    type Mask = Mask<isize, N>;
 
     #[inline]
     fn simd_eq(self, other: Self) -> Self::Mask {
@@ -91,11 +93,11 @@ where
     }
 }
 
-impl<T, const LANES: usize> SimdPartialEq for Simd<*mut T, LANES>
+impl<T, const N: usize> SimdPartialEq for Simd<*mut T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
-    type Mask = Mask<isize, LANES>;
+    type Mask = Mask<isize, N>;
 
     #[inline]
     fn simd_eq(self, other: Self) -> Self::Mask {
diff --git a/crates/core_simd/src/ord.rs b/crates/core_simd/src/simd/cmp/ord.rs
similarity index 79%
rename from crates/core_simd/src/ord.rs
rename to crates/core_simd/src/simd/cmp/ord.rs
index b2455190e8231..4e9d49ea2211a 100644
--- a/crates/core_simd/src/ord.rs
+++ b/crates/core_simd/src/simd/cmp/ord.rs
@@ -1,44 +1,47 @@
 use crate::simd::{
-    intrinsics, LaneCount, Mask, Simd, SimdConstPtr, SimdMutPtr, SimdPartialEq, SupportedLaneCount,
+    cmp::SimdPartialEq,
+    intrinsics,
+    ptr::{SimdConstPtr, SimdMutPtr},
+    LaneCount, Mask, Simd, SupportedLaneCount,
 };
 
 /// Parallel `PartialOrd`.
 pub trait SimdPartialOrd: SimdPartialEq {
-    /// Test if each lane is less than the corresponding lane in `other`.
+    /// Test if each element is less than the corresponding element in `other`.
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn simd_lt(self, other: Self) -> Self::Mask;
 
-    /// Test if each lane is less than or equal to the corresponding lane in `other`.
+    /// Test if each element is less than or equal to the corresponding element in `other`.
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn simd_le(self, other: Self) -> Self::Mask;
 
-    /// Test if each lane is greater than the corresponding lane in `other`.
+    /// Test if each element is greater than the corresponding element in `other`.
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn simd_gt(self, other: Self) -> Self::Mask;
 
-    /// Test if each lane is greater than or equal to the corresponding lane in `other`.
+    /// Test if each element is greater than or equal to the corresponding element in `other`.
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn simd_ge(self, other: Self) -> Self::Mask;
 }
 
 /// Parallel `Ord`.
 pub trait SimdOrd: SimdPartialOrd {
-    /// Returns the lane-wise maximum with `other`.
+    /// Returns the element-wise maximum with `other`.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn simd_max(self, other: Self) -> Self;
 
-    /// Returns the lane-wise minimum with `other`.
+    /// Returns the element-wise minimum with `other`.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn simd_min(self, other: Self) -> Self;
 
-    /// Restrict each lane to a certain interval.
+    /// Restrict each element to a certain interval.
     ///
-    /// For each lane, returns `max` if `self` is greater than `max`, and `min` if `self` is
+    /// For each element, returns `max` if `self` is greater than `max`, and `min` if `self` is
     /// less than `min`. Otherwise returns `self`.
     ///
     /// # Panics
     ///
-    /// Panics if `min > max` on any lane.
+    /// Panics if `min > max` on any element.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn simd_clamp(self, min: Self, max: Self) -> Self;
 }
@@ -46,9 +49,9 @@ pub trait SimdOrd: SimdPartialOrd {
 macro_rules! impl_integer {
     { $($integer:ty),* } => {
         $(
-        impl<const LANES: usize> SimdPartialOrd for Simd<$integer, LANES>
+        impl<const N: usize> SimdPartialOrd for Simd<$integer, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
             #[inline]
             fn simd_lt(self, other: Self) -> Self::Mask {
@@ -79,9 +82,9 @@ macro_rules! impl_integer {
             }
         }
 
-        impl<const LANES: usize> SimdOrd for Simd<$integer, LANES>
+        impl<const N: usize> SimdOrd for Simd<$integer, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
             #[inline]
             fn simd_max(self, other: Self) -> Self {
@@ -98,7 +101,7 @@ macro_rules! impl_integer {
             fn simd_clamp(self, min: Self, max: Self) -> Self {
                 assert!(
                     min.simd_le(max).all(),
-                    "each lane in `min` must be less than or equal to the corresponding lane in `max`",
+                    "each element in `min` must be less than or equal to the corresponding element in `max`",
                 );
                 self.simd_max(min).simd_min(max)
             }
@@ -112,9 +115,9 @@ impl_integer! { u8, u16, u32, u64, usize, i8, i16, i32, i64, isize }
 macro_rules! impl_float {
     { $($float:ty),* } => {
         $(
-        impl<const LANES: usize> SimdPartialOrd for Simd<$float, LANES>
+        impl<const N: usize> SimdPartialOrd for Simd<$float, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
             #[inline]
             fn simd_lt(self, other: Self) -> Self::Mask {
@@ -153,9 +156,9 @@ impl_float! { f32, f64 }
 macro_rules! impl_mask {
     { $($integer:ty),* } => {
         $(
-        impl<const LANES: usize> SimdPartialOrd for Mask<$integer, LANES>
+        impl<const N: usize> SimdPartialOrd for Mask<$integer, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
             #[inline]
             fn simd_lt(self, other: Self) -> Self::Mask {
@@ -186,9 +189,9 @@ macro_rules! impl_mask {
             }
         }
 
-        impl<const LANES: usize> SimdOrd for Mask<$integer, LANES>
+        impl<const N: usize> SimdOrd for Mask<$integer, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
             #[inline]
             fn simd_max(self, other: Self) -> Self {
@@ -205,7 +208,7 @@ macro_rules! impl_mask {
             fn simd_clamp(self, min: Self, max: Self) -> Self {
                 assert!(
                     min.simd_le(max).all(),
-                    "each lane in `min` must be less than or equal to the corresponding lane in `max`",
+                    "each element in `min` must be less than or equal to the corresponding element in `max`",
                 );
                 self.simd_max(min).simd_min(max)
             }
@@ -216,9 +219,9 @@ macro_rules! impl_mask {
 
 impl_mask! { i8, i16, i32, i64, isize }
 
-impl<T, const LANES: usize> SimdPartialOrd for Simd<*const T, LANES>
+impl<T, const N: usize> SimdPartialOrd for Simd<*const T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn simd_lt(self, other: Self) -> Self::Mask {
@@ -241,9 +244,9 @@ where
     }
 }
 
-impl<T, const LANES: usize> SimdOrd for Simd<*const T, LANES>
+impl<T, const N: usize> SimdOrd for Simd<*const T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn simd_max(self, other: Self) -> Self {
@@ -260,15 +263,15 @@ where
     fn simd_clamp(self, min: Self, max: Self) -> Self {
         assert!(
             min.simd_le(max).all(),
-            "each lane in `min` must be less than or equal to the corresponding lane in `max`",
+            "each element in `min` must be less than or equal to the corresponding element in `max`",
         );
         self.simd_max(min).simd_min(max)
     }
 }
 
-impl<T, const LANES: usize> SimdPartialOrd for Simd<*mut T, LANES>
+impl<T, const N: usize> SimdPartialOrd for Simd<*mut T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn simd_lt(self, other: Self) -> Self::Mask {
@@ -291,9 +294,9 @@ where
     }
 }
 
-impl<T, const LANES: usize> SimdOrd for Simd<*mut T, LANES>
+impl<T, const N: usize> SimdOrd for Simd<*mut T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
     #[inline]
     fn simd_max(self, other: Self) -> Self {
@@ -310,7 +313,7 @@ where
     fn simd_clamp(self, min: Self, max: Self) -> Self {
         assert!(
             min.simd_le(max).all(),
-            "each lane in `min` must be less than or equal to the corresponding lane in `max`",
+            "each element in `min` must be less than or equal to the corresponding element in `max`",
         );
         self.simd_max(min).simd_min(max)
     }
diff --git a/crates/core_simd/src/elements.rs b/crates/core_simd/src/simd/num.rs
similarity index 63%
rename from crates/core_simd/src/elements.rs
rename to crates/core_simd/src/simd/num.rs
index dc7f52a4d576c..22a4802ec6cb5 100644
--- a/crates/core_simd/src/elements.rs
+++ b/crates/core_simd/src/simd/num.rs
@@ -1,15 +1,13 @@
-mod const_ptr;
+//! Traits for vectors with numeric elements.
+
 mod float;
 mod int;
-mod mut_ptr;
 mod uint;
 
 mod sealed {
     pub trait Sealed {}
 }
 
-pub use const_ptr::*;
 pub use float::*;
 pub use int::*;
-pub use mut_ptr::*;
 pub use uint::*;
diff --git a/crates/core_simd/src/elements/float.rs b/crates/core_simd/src/simd/num/float.rs
similarity index 80%
rename from crates/core_simd/src/elements/float.rs
rename to crates/core_simd/src/simd/num/float.rs
index 501c1c5ddd3f2..fc0b99e87a68e 100644
--- a/crates/core_simd/src/elements/float.rs
+++ b/crates/core_simd/src/simd/num/float.rs
@@ -1,7 +1,7 @@
 use super::sealed::Sealed;
 use crate::simd::{
-    intrinsics, LaneCount, Mask, Simd, SimdCast, SimdElement, SimdPartialEq, SimdPartialOrd,
-    SupportedLaneCount,
+    cmp::{SimdPartialEq, SimdPartialOrd},
+    intrinsics, LaneCount, Mask, Simd, SimdCast, SimdElement, SupportedLaneCount,
 };
 
 /// Operations on SIMD vectors of floats.
@@ -28,7 +28,7 @@ pub trait SimdFloat: Copy + Sealed {
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{SimdFloat, SimdInt, Simd};
+    /// # use simd::prelude::*;
     /// let floats: Simd<f32, 4> = Simd::from_array([1.9, -4.5, f32::INFINITY, f32::NAN]);
     /// let ints = floats.cast::<i32>();
     /// assert_eq!(ints, Simd::from_array([1, -4, i32::MAX, 0]));
@@ -63,64 +63,64 @@ pub trait SimdFloat: Copy + Sealed {
         Self::Scalar: core::convert::FloatToInt<I>;
 
     /// Raw transmutation to an unsigned integer vector type with the
-    /// same size and number of lanes.
+    /// same size and number of elements.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn to_bits(self) -> Self::Bits;
 
     /// Raw transmutation from an unsigned integer vector type with the
-    /// same size and number of lanes.
+    /// same size and number of elements.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn from_bits(bits: Self::Bits) -> Self;
 
-    /// Produces a vector where every lane has the absolute value of the
-    /// equivalently-indexed lane in `self`.
+    /// Produces a vector where every element has the absolute value of the
+    /// equivalently-indexed element in `self`.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn abs(self) -> Self;
 
-    /// Takes the reciprocal (inverse) of each lane, `1/x`.
+    /// Takes the reciprocal (inverse) of each element, `1/x`.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn recip(self) -> Self;
 
-    /// Converts each lane from radians to degrees.
+    /// Converts each element from radians to degrees.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn to_degrees(self) -> Self;
 
-    /// Converts each lane from degrees to radians.
+    /// Converts each element from degrees to radians.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn to_radians(self) -> Self;
 
-    /// Returns true for each lane if it has a positive sign, including
+    /// Returns true for each element if it has a positive sign, including
     /// `+0.0`, `NaN`s with positive sign bit and positive infinity.
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn is_sign_positive(self) -> Self::Mask;
 
-    /// Returns true for each lane if it has a negative sign, including
+    /// Returns true for each element if it has a negative sign, including
     /// `-0.0`, `NaN`s with negative sign bit and negative infinity.
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn is_sign_negative(self) -> Self::Mask;
 
-    /// Returns true for each lane if its value is `NaN`.
+    /// Returns true for each element if its value is `NaN`.
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn is_nan(self) -> Self::Mask;
 
-    /// Returns true for each lane if its value is positive infinity or negative infinity.
+    /// Returns true for each element if its value is positive infinity or negative infinity.
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn is_infinite(self) -> Self::Mask;
 
-    /// Returns true for each lane if its value is neither infinite nor `NaN`.
+    /// Returns true for each element if its value is neither infinite nor `NaN`.
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn is_finite(self) -> Self::Mask;
 
-    /// Returns true for each lane if its value is subnormal.
+    /// Returns true for each element if its value is subnormal.
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn is_subnormal(self) -> Self::Mask;
 
-    /// Returns true for each lane if its value is neither zero, infinite,
+    /// Returns true for each element if its value is neither zero, infinite,
     /// subnormal, nor `NaN`.
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn is_normal(self) -> Self::Mask;
 
-    /// Replaces each lane with a number that represents its sign.
+    /// Replaces each element with a number that represents its sign.
     ///
     /// * `1.0` if the number is positive, `+0.0`, or `INFINITY`
     /// * `-1.0` if the number is negative, `-0.0`, or `NEG_INFINITY`
@@ -128,33 +128,33 @@ pub trait SimdFloat: Copy + Sealed {
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn signum(self) -> Self;
 
-    /// Returns each lane with the magnitude of `self` and the sign of `sign`.
+    /// Returns each element with the magnitude of `self` and the sign of `sign`.
     ///
-    /// For any lane containing a `NAN`, a `NAN` with the sign of `sign` is returned.
+    /// For any element containing a `NAN`, a `NAN` with the sign of `sign` is returned.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn copysign(self, sign: Self) -> Self;
 
-    /// Returns the minimum of each lane.
+    /// Returns the element-wise minimum of `self` and `other`.
     ///
     /// If one of the values is `NAN`, then the other value is returned.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn simd_min(self, other: Self) -> Self;
 
-    /// Returns the maximum of each lane.
+    /// Returns the element-wise maximum of `self` and `other`.
     ///
     /// If one of the values is `NAN`, then the other value is returned.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn simd_max(self, other: Self) -> Self;
 
-    /// Restrict each lane to a certain interval unless it is NaN.
+    /// Restrict each element to a certain interval unless it is NaN.
     ///
-    /// For each lane in `self`, returns the corresponding lane in `max` if the lane is
-    /// greater than `max`, and the corresponding lane in `min` if the lane is less
-    /// than `min`.  Otherwise returns the lane in `self`.
+    /// For each element in `self`, returns the corresponding element in `max` if the element is
+    /// greater than `max`, and the corresponding element in `min` if the element is less
+    /// than `min`.  Otherwise returns the element in `self`.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn simd_clamp(self, min: Self, max: Self) -> Self;
 
-    /// Returns the sum of the lanes of the vector.
+    /// Returns the sum of the elements of the vector.
     ///
     /// # Examples
     ///
@@ -162,13 +162,13 @@ pub trait SimdFloat: Copy + Sealed {
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{f32x2, SimdFloat};
+    /// # use simd::prelude::*;
     /// let v = f32x2::from_array([1., 2.]);
     /// assert_eq!(v.reduce_sum(), 3.);
     /// ```
     fn reduce_sum(self) -> Self::Scalar;
 
-    /// Reducing multiply.  Returns the product of the lanes of the vector.
+    /// Reducing multiply.  Returns the product of the elements of the vector.
     ///
     /// # Examples
     ///
@@ -176,18 +176,18 @@ pub trait SimdFloat: Copy + Sealed {
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{f32x2, SimdFloat};
+    /// # use simd::prelude::*;
     /// let v = f32x2::from_array([3., 4.]);
     /// assert_eq!(v.reduce_product(), 12.);
     /// ```
     fn reduce_product(self) -> Self::Scalar;
 
-    /// Returns the maximum lane in the vector.
+    /// Returns the maximum element in the vector.
     ///
     /// Returns values based on equality, so a vector containing both `0.` and `-0.` may
     /// return either.
     ///
-    /// This function will not return `NaN` unless all lanes are `NaN`.
+    /// This function will not return `NaN` unless all elements are `NaN`.
     ///
     /// # Examples
     ///
@@ -195,7 +195,7 @@ pub trait SimdFloat: Copy + Sealed {
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{f32x2, SimdFloat};
+    /// # use simd::prelude::*;
     /// let v = f32x2::from_array([1., 2.]);
     /// assert_eq!(v.reduce_max(), 2.);
     ///
@@ -209,12 +209,12 @@ pub trait SimdFloat: Copy + Sealed {
     /// ```
     fn reduce_max(self) -> Self::Scalar;
 
-    /// Returns the minimum lane in the vector.
+    /// Returns the minimum element in the vector.
     ///
     /// Returns values based on equality, so a vector containing both `0.` and `-0.` may
     /// return either.
     ///
-    /// This function will not return `NaN` unless all lanes are `NaN`.
+    /// This function will not return `NaN` unless all elements are `NaN`.
     ///
     /// # Examples
     ///
@@ -222,7 +222,7 @@ pub trait SimdFloat: Copy + Sealed {
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{f32x2, SimdFloat};
+    /// # use simd::prelude::*;
     /// let v = f32x2::from_array([3., 7.]);
     /// assert_eq!(v.reduce_min(), 3.);
     ///
@@ -240,20 +240,20 @@ pub trait SimdFloat: Copy + Sealed {
 macro_rules! impl_trait {
     { $($ty:ty { bits: $bits_ty:ty, mask: $mask_ty:ty }),* } => {
         $(
-        impl<const LANES: usize> Sealed for Simd<$ty, LANES>
+        impl<const N: usize> Sealed for Simd<$ty, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
         }
 
-        impl<const LANES: usize> SimdFloat for Simd<$ty, LANES>
+        impl<const N: usize> SimdFloat for Simd<$ty, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
-            type Mask = Mask<<$mask_ty as SimdElement>::Mask, LANES>;
+            type Mask = Mask<<$mask_ty as SimdElement>::Mask, N>;
             type Scalar = $ty;
-            type Bits = Simd<$bits_ty, LANES>;
-            type Cast<T: SimdElement> = Simd<T, LANES>;
+            type Bits = Simd<$bits_ty, N>;
+            type Cast<T: SimdElement> = Simd<T, N>;
 
             #[inline]
             fn cast<T: SimdCast>(self) -> Self::Cast<T>
@@ -273,14 +273,14 @@ macro_rules! impl_trait {
             }
 
             #[inline]
-            fn to_bits(self) -> Simd<$bits_ty, LANES> {
+            fn to_bits(self) -> Simd<$bits_ty, N> {
                 assert_eq!(core::mem::size_of::<Self>(), core::mem::size_of::<Self::Bits>());
                 // Safety: transmuting between vector types is safe
                 unsafe { core::mem::transmute_copy(&self) }
             }
 
             #[inline]
-            fn from_bits(bits: Simd<$bits_ty, LANES>) -> Self {
+            fn from_bits(bits: Simd<$bits_ty, N>) -> Self {
                 assert_eq!(core::mem::size_of::<Self>(), core::mem::size_of::<Self::Bits>());
                 // Safety: transmuting between vector types is safe
                 unsafe { core::mem::transmute_copy(&bits) }
@@ -336,7 +336,10 @@ macro_rules! impl_trait {
 
             #[inline]
             fn is_subnormal(self) -> Self::Mask {
-                self.abs().simd_ne(Self::splat(0.0)) & (self.to_bits() & Self::splat(Self::Scalar::INFINITY).to_bits()).simd_eq(Simd::splat(0))
+                // On some architectures (e.g. armv7 and some ppc) subnormals are flushed to zero,
+                // so this comparison must be done with integers.
+                let not_zero = self.abs().to_bits().simd_ne(Self::splat(0.0).to_bits());
+                not_zero & (self.to_bits() & Self::splat(Self::Scalar::INFINITY).to_bits()).simd_eq(Simd::splat(0))
             }
 
             #[inline]
@@ -373,7 +376,7 @@ macro_rules! impl_trait {
             fn simd_clamp(self, min: Self, max: Self) -> Self {
                 assert!(
                     min.simd_le(max).all(),
-                    "each lane in `min` must be less than or equal to the corresponding lane in `max`",
+                    "each element in `min` must be less than or equal to the corresponding element in `max`",
                 );
                 let mut x = self;
                 x = x.simd_lt(min).select(min, x);
diff --git a/crates/core_simd/src/elements/int.rs b/crates/core_simd/src/simd/num/int.rs
similarity index 71%
rename from crates/core_simd/src/elements/int.rs
rename to crates/core_simd/src/simd/num/int.rs
index 6db89ff9a659e..1f1aa27278299 100644
--- a/crates/core_simd/src/elements/int.rs
+++ b/crates/core_simd/src/simd/num/int.rs
@@ -1,6 +1,7 @@
 use super::sealed::Sealed;
 use crate::simd::{
-    intrinsics, LaneCount, Mask, Simd, SimdCast, SimdElement, SimdPartialOrd, SupportedLaneCount,
+    cmp::SimdPartialOrd, intrinsics, num::SimdUint, LaneCount, Mask, Simd, SimdCast, SimdElement,
+    SupportedLaneCount,
 };
 
 /// Operations on SIMD vectors of signed integers.
@@ -11,6 +12,9 @@ pub trait SimdInt: Copy + Sealed {
     /// Scalar type contained by this SIMD vector type.
     type Scalar;
 
+    /// A SIMD vector of unsigned integers with the same element size.
+    type Unsigned;
+
     /// A SIMD vector with a different element type.
     type Cast<T: SimdElement>;
 
@@ -28,7 +32,7 @@ pub trait SimdInt: Copy + Sealed {
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{Simd, SimdInt};
+    /// # use simd::prelude::*;
     /// use core::i32::{MIN, MAX};
     /// let x = Simd::from_array([MIN, 0, 1, MAX]);
     /// let max = Simd::splat(MAX);
@@ -46,7 +50,7 @@ pub trait SimdInt: Copy + Sealed {
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{Simd, SimdInt};
+    /// # use simd::prelude::*;
     /// use core::i32::{MIN, MAX};
     /// let x = Simd::from_array([MIN, -2, -1, MAX]);
     /// let max = Simd::splat(MAX);
@@ -57,14 +61,14 @@ pub trait SimdInt: Copy + Sealed {
     fn saturating_sub(self, second: Self) -> Self;
 
     /// Lanewise absolute value, implemented in Rust.
-    /// Every lane becomes its absolute value.
+    /// Every element becomes its absolute value.
     ///
     /// # Examples
     /// ```
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{Simd, SimdInt};
+    /// # use simd::prelude::*;
     /// use core::i32::{MIN, MAX};
     /// let xs = Simd::from_array([MIN, MIN +1, -5, 0]);
     /// assert_eq!(xs.abs(), Simd::from_array([MIN, MAX, 5, 0]));
@@ -79,7 +83,7 @@ pub trait SimdInt: Copy + Sealed {
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{Simd, SimdInt};
+    /// # use simd::prelude::*;
     /// use core::i32::{MIN, MAX};
     /// let xs = Simd::from_array([MIN, -2, 0, 3]);
     /// let unsat = xs.abs();
@@ -97,7 +101,7 @@ pub trait SimdInt: Copy + Sealed {
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{Simd, SimdInt};
+    /// # use simd::prelude::*;
     /// use core::i32::{MIN, MAX};
     /// let x = Simd::from_array([MIN, -2, 3, MAX]);
     /// let unsat = -x;
@@ -107,19 +111,19 @@ pub trait SimdInt: Copy + Sealed {
     /// ```
     fn saturating_neg(self) -> Self;
 
-    /// Returns true for each positive lane and false if it is zero or negative.
+    /// Returns true for each positive element and false if it is zero or negative.
     fn is_positive(self) -> Self::Mask;
 
-    /// Returns true for each negative lane and false if it is zero or positive.
+    /// Returns true for each negative element and false if it is zero or positive.
     fn is_negative(self) -> Self::Mask;
 
-    /// Returns numbers representing the sign of each lane.
+    /// Returns numbers representing the sign of each element.
     /// * `0` if the number is zero
     /// * `1` if the number is positive
     /// * `-1` if the number is negative
     fn signum(self) -> Self;
 
-    /// Returns the sum of the lanes of the vector, with wrapping addition.
+    /// Returns the sum of the elements of the vector, with wrapping addition.
     ///
     /// # Examples
     ///
@@ -127,7 +131,7 @@ pub trait SimdInt: Copy + Sealed {
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{i32x4, SimdInt};
+    /// # use simd::prelude::*;
     /// let v = i32x4::from_array([1, 2, 3, 4]);
     /// assert_eq!(v.reduce_sum(), 10);
     ///
@@ -137,7 +141,7 @@ pub trait SimdInt: Copy + Sealed {
     /// ```
     fn reduce_sum(self) -> Self::Scalar;
 
-    /// Returns the product of the lanes of the vector, with wrapping multiplication.
+    /// Returns the product of the elements of the vector, with wrapping multiplication.
     ///
     /// # Examples
     ///
@@ -145,7 +149,7 @@ pub trait SimdInt: Copy + Sealed {
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{i32x4, SimdInt};
+    /// # use simd::prelude::*;
     /// let v = i32x4::from_array([1, 2, 3, 4]);
     /// assert_eq!(v.reduce_product(), 24);
     ///
@@ -155,7 +159,7 @@ pub trait SimdInt: Copy + Sealed {
     /// ```
     fn reduce_product(self) -> Self::Scalar;
 
-    /// Returns the maximum lane in the vector.
+    /// Returns the maximum element in the vector.
     ///
     /// # Examples
     ///
@@ -163,13 +167,13 @@ pub trait SimdInt: Copy + Sealed {
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{i32x4, SimdInt};
+    /// # use simd::prelude::*;
     /// let v = i32x4::from_array([1, 2, 3, 4]);
     /// assert_eq!(v.reduce_max(), 4);
     /// ```
     fn reduce_max(self) -> Self::Scalar;
 
-    /// Returns the minimum lane in the vector.
+    /// Returns the minimum element in the vector.
     ///
     /// # Examples
     ///
@@ -177,38 +181,58 @@ pub trait SimdInt: Copy + Sealed {
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{i32x4, SimdInt};
+    /// # use simd::prelude::*;
     /// let v = i32x4::from_array([1, 2, 3, 4]);
     /// assert_eq!(v.reduce_min(), 1);
     /// ```
     fn reduce_min(self) -> Self::Scalar;
 
-    /// Returns the cumulative bitwise "and" across the lanes of the vector.
+    /// Returns the cumulative bitwise "and" across the elements of the vector.
     fn reduce_and(self) -> Self::Scalar;
 
-    /// Returns the cumulative bitwise "or" across the lanes of the vector.
+    /// Returns the cumulative bitwise "or" across the elements of the vector.
     fn reduce_or(self) -> Self::Scalar;
 
-    /// Returns the cumulative bitwise "xor" across the lanes of the vector.
+    /// Returns the cumulative bitwise "xor" across the elements of the vector.
     fn reduce_xor(self) -> Self::Scalar;
+
+    /// Reverses the byte order of each element.
+    fn swap_bytes(self) -> Self;
+
+    /// Reverses the order of bits in each element.
+    /// The least significant bit becomes the most significant bit, second least-significant bit becomes second most-significant bit, etc.
+    fn reverse_bits(self) -> Self;
+
+    /// Returns the number of leading zeros in the binary representation of each element.
+    fn leading_zeros(self) -> Self::Unsigned;
+
+    /// Returns the number of trailing zeros in the binary representation of each element.
+    fn trailing_zeros(self) -> Self::Unsigned;
+
+    /// Returns the number of leading ones in the binary representation of each element.
+    fn leading_ones(self) -> Self::Unsigned;
+
+    /// Returns the number of trailing ones in the binary representation of each element.
+    fn trailing_ones(self) -> Self::Unsigned;
 }
 
 macro_rules! impl_trait {
-    { $($ty:ty),* } => {
+    { $($ty:ident ($unsigned:ident)),* } => {
         $(
-        impl<const LANES: usize> Sealed for Simd<$ty, LANES>
+        impl<const N: usize> Sealed for Simd<$ty, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
         }
 
-        impl<const LANES: usize> SimdInt for Simd<$ty, LANES>
+        impl<const N: usize> SimdInt for Simd<$ty, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
-            type Mask = Mask<<$ty as SimdElement>::Mask, LANES>;
+            type Mask = Mask<<$ty as SimdElement>::Mask, N>;
             type Scalar = $ty;
-            type Cast<T: SimdElement> = Simd<T, LANES>;
+            type Unsigned = Simd<$unsigned, N>;
+            type Cast<T: SimdElement> = Simd<T, N>;
 
             #[inline]
             fn cast<T: SimdCast>(self) -> Self::Cast<T> {
@@ -307,9 +331,41 @@ macro_rules! impl_trait {
                 // Safety: `self` is an integer vector
                 unsafe { intrinsics::simd_reduce_xor(self) }
             }
+
+            #[inline]
+            fn swap_bytes(self) -> Self {
+                // Safety: `self` is an integer vector
+                unsafe { intrinsics::simd_bswap(self) }
+            }
+
+            #[inline]
+            fn reverse_bits(self) -> Self {
+                // Safety: `self` is an integer vector
+                unsafe { intrinsics::simd_bitreverse(self) }
+            }
+
+            #[inline]
+            fn leading_zeros(self) -> Self::Unsigned {
+                self.cast::<$unsigned>().leading_zeros()
+            }
+
+            #[inline]
+            fn trailing_zeros(self) -> Self::Unsigned {
+                self.cast::<$unsigned>().trailing_zeros()
+            }
+
+            #[inline]
+            fn leading_ones(self) -> Self::Unsigned {
+                self.cast::<$unsigned>().leading_ones()
+            }
+
+            #[inline]
+            fn trailing_ones(self) -> Self::Unsigned {
+                self.cast::<$unsigned>().trailing_ones()
+            }
         }
         )*
     }
 }
 
-impl_trait! { i8, i16, i32, i64, isize }
+impl_trait! { i8 (u8), i16 (u16), i32 (u32), i64 (u64), isize (usize) }
diff --git a/crates/core_simd/src/elements/uint.rs b/crates/core_simd/src/simd/num/uint.rs
similarity index 58%
rename from crates/core_simd/src/elements/uint.rs
rename to crates/core_simd/src/simd/num/uint.rs
index 3926c395ec9ac..c955ee8fe8bd1 100644
--- a/crates/core_simd/src/elements/uint.rs
+++ b/crates/core_simd/src/simd/num/uint.rs
@@ -16,6 +16,12 @@ pub trait SimdUint: Copy + Sealed {
     #[must_use]
     fn cast<T: SimdCast>(self) -> Self::Cast<T>;
 
+    /// Wrapping negation.
+    ///
+    /// Like [`u32::wrapping_neg`], all applications of this function will wrap, with the exception
+    /// of `-0`.
+    fn wrapping_neg(self) -> Self;
+
     /// Lanewise saturating add.
     ///
     /// # Examples
@@ -23,7 +29,7 @@ pub trait SimdUint: Copy + Sealed {
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{Simd, SimdUint};
+    /// # use simd::prelude::*;
     /// use core::u32::MAX;
     /// let x = Simd::from_array([2, 1, 0, MAX]);
     /// let max = Simd::splat(MAX);
@@ -41,7 +47,7 @@ pub trait SimdUint: Copy + Sealed {
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{Simd, SimdUint};
+    /// # use simd::prelude::*;
     /// use core::u32::MAX;
     /// let x = Simd::from_array([2, 1, 0, MAX]);
     /// let max = Simd::splat(MAX);
@@ -51,43 +57,62 @@ pub trait SimdUint: Copy + Sealed {
     /// assert_eq!(sat, Simd::splat(0));
     fn saturating_sub(self, second: Self) -> Self;
 
-    /// Returns the sum of the lanes of the vector, with wrapping addition.
+    /// Returns the sum of the elements of the vector, with wrapping addition.
     fn reduce_sum(self) -> Self::Scalar;
 
-    /// Returns the product of the lanes of the vector, with wrapping multiplication.
+    /// Returns the product of the elements of the vector, with wrapping multiplication.
     fn reduce_product(self) -> Self::Scalar;
 
-    /// Returns the maximum lane in the vector.
+    /// Returns the maximum element in the vector.
     fn reduce_max(self) -> Self::Scalar;
 
-    /// Returns the minimum lane in the vector.
+    /// Returns the minimum element in the vector.
     fn reduce_min(self) -> Self::Scalar;
 
-    /// Returns the cumulative bitwise "and" across the lanes of the vector.
+    /// Returns the cumulative bitwise "and" across the elements of the vector.
     fn reduce_and(self) -> Self::Scalar;
 
-    /// Returns the cumulative bitwise "or" across the lanes of the vector.
+    /// Returns the cumulative bitwise "or" across the elements of the vector.
     fn reduce_or(self) -> Self::Scalar;
 
-    /// Returns the cumulative bitwise "xor" across the lanes of the vector.
+    /// Returns the cumulative bitwise "xor" across the elements of the vector.
     fn reduce_xor(self) -> Self::Scalar;
+
+    /// Reverses the byte order of each element.
+    fn swap_bytes(self) -> Self;
+
+    /// Reverses the order of bits in each element.
+    /// The least significant bit becomes the most significant bit, second least-significant bit becomes second most-significant bit, etc.
+    fn reverse_bits(self) -> Self;
+
+    /// Returns the number of leading zeros in the binary representation of each element.
+    fn leading_zeros(self) -> Self;
+
+    /// Returns the number of trailing zeros in the binary representation of each element.
+    fn trailing_zeros(self) -> Self;
+
+    /// Returns the number of leading ones in the binary representation of each element.
+    fn leading_ones(self) -> Self;
+
+    /// Returns the number of trailing ones in the binary representation of each element.
+    fn trailing_ones(self) -> Self;
 }
 
 macro_rules! impl_trait {
-    { $($ty:ty),* } => {
+    { $($ty:ident ($signed:ident)),* } => {
         $(
-        impl<const LANES: usize> Sealed for Simd<$ty, LANES>
+        impl<const N: usize> Sealed for Simd<$ty, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
         }
 
-        impl<const LANES: usize> SimdUint for Simd<$ty, LANES>
+        impl<const N: usize> SimdUint for Simd<$ty, N>
         where
-            LaneCount<LANES>: SupportedLaneCount,
+            LaneCount<N>: SupportedLaneCount,
         {
             type Scalar = $ty;
-            type Cast<T: SimdElement> = Simd<T, LANES>;
+            type Cast<T: SimdElement> = Simd<T, N>;
 
             #[inline]
             fn cast<T: SimdCast>(self) -> Self::Cast<T> {
@@ -95,6 +120,12 @@ macro_rules! impl_trait {
                 unsafe { intrinsics::simd_as(self) }
             }
 
+            #[inline]
+            fn wrapping_neg(self) -> Self {
+                use crate::simd::num::SimdInt;
+                (-self.cast::<$signed>()).cast()
+            }
+
             #[inline]
             fn saturating_add(self, second: Self) -> Self {
                 // Safety: `self` is a vector
@@ -148,9 +179,43 @@ macro_rules! impl_trait {
                 // Safety: `self` is an integer vector
                 unsafe { intrinsics::simd_reduce_xor(self) }
             }
+
+            #[inline]
+            fn swap_bytes(self) -> Self {
+                // Safety: `self` is an integer vector
+                unsafe { intrinsics::simd_bswap(self) }
+            }
+
+            #[inline]
+            fn reverse_bits(self) -> Self {
+                // Safety: `self` is an integer vector
+                unsafe { intrinsics::simd_bitreverse(self) }
+            }
+
+            #[inline]
+            fn leading_zeros(self) -> Self {
+                // Safety: `self` is an integer vector
+                unsafe { intrinsics::simd_ctlz(self) }
+            }
+
+            #[inline]
+            fn trailing_zeros(self) -> Self {
+                // Safety: `self` is an integer vector
+                unsafe { intrinsics::simd_cttz(self) }
+            }
+
+            #[inline]
+            fn leading_ones(self) -> Self {
+                (!self).leading_zeros()
+            }
+
+            #[inline]
+            fn trailing_ones(self) -> Self {
+                (!self).trailing_zeros()
+            }
         }
         )*
     }
 }
 
-impl_trait! { u8, u16, u32, u64, usize }
+impl_trait! { u8 (i8), u16 (i16), u32 (i32), u64 (i64), usize (isize) }
diff --git a/crates/core_simd/src/simd/prelude.rs b/crates/core_simd/src/simd/prelude.rs
index e8fdc932d490f..4b7c744c01326 100644
--- a/crates/core_simd/src/simd/prelude.rs
+++ b/crates/core_simd/src/simd/prelude.rs
@@ -7,8 +7,10 @@
 
 #[doc(no_inline)]
 pub use super::{
-    simd_swizzle, Mask, Simd, SimdConstPtr, SimdFloat, SimdInt, SimdMutPtr, SimdOrd, SimdPartialEq,
-    SimdPartialOrd, SimdUint,
+    cmp::{SimdOrd, SimdPartialEq, SimdPartialOrd},
+    num::{SimdFloat, SimdInt, SimdUint},
+    ptr::{SimdConstPtr, SimdMutPtr},
+    simd_swizzle, Mask, Simd,
 };
 
 #[rustfmt::skip]
diff --git a/crates/core_simd/src/simd/ptr.rs b/crates/core_simd/src/simd/ptr.rs
new file mode 100644
index 0000000000000..3f8e666911853
--- /dev/null
+++ b/crates/core_simd/src/simd/ptr.rs
@@ -0,0 +1,11 @@
+//! Traits for vectors of pointers.
+
+mod const_ptr;
+mod mut_ptr;
+
+mod sealed {
+    pub trait Sealed {}
+}
+
+pub use const_ptr::*;
+pub use mut_ptr::*;
diff --git a/crates/core_simd/src/elements/const_ptr.rs b/crates/core_simd/src/simd/ptr/const_ptr.rs
similarity index 81%
rename from crates/core_simd/src/elements/const_ptr.rs
rename to crates/core_simd/src/simd/ptr/const_ptr.rs
index f215f9a61d02e..97fe3fb600dfa 100644
--- a/crates/core_simd/src/elements/const_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/const_ptr.rs
@@ -1,15 +1,17 @@
 use super::sealed::Sealed;
-use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SimdUint, SupportedLaneCount};
+use crate::simd::{
+    cmp::SimdPartialEq, intrinsics, num::SimdUint, LaneCount, Mask, Simd, SupportedLaneCount,
+};
 
 /// Operations on SIMD vectors of constant pointers.
 pub trait SimdConstPtr: Copy + Sealed {
-    /// Vector of `usize` with the same number of lanes.
+    /// Vector of `usize` with the same number of elements.
     type Usize;
 
-    /// Vector of `isize` with the same number of lanes.
+    /// Vector of `isize` with the same number of elements.
     type Isize;
 
-    /// Vector of const pointers with the same number of lanes.
+    /// Vector of const pointers with the same number of elements.
     type CastPtr<T>;
 
     /// Vector of mutable pointers to the same type.
@@ -18,17 +20,17 @@ pub trait SimdConstPtr: Copy + Sealed {
     /// Mask type used for manipulating this SIMD vector type.
     type Mask;
 
-    /// Returns `true` for each lane that is null.
+    /// Returns `true` for each element that is null.
     fn is_null(self) -> Self::Mask;
 
     /// Casts to a pointer of another type.
     ///
-    /// Equivalent to calling [`pointer::cast`] on each lane.
+    /// Equivalent to calling [`pointer::cast`] on each element.
     fn cast<T>(self) -> Self::CastPtr<T>;
 
     /// Changes constness without changing the type.
     ///
-    /// Equivalent to calling [`pointer::cast_mut`] on each lane.
+    /// Equivalent to calling [`pointer::cast_mut`] on each element.
     fn cast_mut(self) -> Self::MutPtr;
 
     /// Gets the "address" portion of the pointer.
@@ -39,7 +41,7 @@ pub trait SimdConstPtr: Copy + Sealed {
     /// This method semantically discards *provenance* and
     /// *address-space* information. To properly restore that information, use [`Self::with_addr`].
     ///
-    /// Equivalent to calling [`pointer::addr`] on each lane.
+    /// Equivalent to calling [`pointer::addr`] on each element.
     fn addr(self) -> Self::Usize;
 
     /// Creates a new pointer with the given address.
@@ -47,7 +49,7 @@ pub trait SimdConstPtr: Copy + Sealed {
     /// This performs the same operation as a cast, but copies the *address-space* and
     /// *provenance* of `self` to the new pointer.
     ///
-    /// Equivalent to calling [`pointer::with_addr`] on each lane.
+    /// Equivalent to calling [`pointer::with_addr`] on each element.
     fn with_addr(self, addr: Self::Usize) -> Self;
 
     /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use
@@ -56,39 +58,36 @@ pub trait SimdConstPtr: Copy + Sealed {
 
     /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
     ///
-    /// Equivalent to calling [`core::ptr::from_exposed_addr`] on each lane.
+    /// Equivalent to calling [`core::ptr::from_exposed_addr`] on each element.
     fn from_exposed_addr(addr: Self::Usize) -> Self;
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
-    /// Equivalent to calling [`pointer::wrapping_offset`] on each lane.
+    /// Equivalent to calling [`pointer::wrapping_offset`] on each element.
     fn wrapping_offset(self, offset: Self::Isize) -> Self;
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
-    /// Equivalent to calling [`pointer::wrapping_add`] on each lane.
+    /// Equivalent to calling [`pointer::wrapping_add`] on each element.
     fn wrapping_add(self, count: Self::Usize) -> Self;
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
-    /// Equivalent to calling [`pointer::wrapping_sub`] on each lane.
+    /// Equivalent to calling [`pointer::wrapping_sub`] on each element.
     fn wrapping_sub(self, count: Self::Usize) -> Self;
 }
 
-impl<T, const LANES: usize> Sealed for Simd<*const T, LANES> where
-    LaneCount<LANES>: SupportedLaneCount
-{
-}
+impl<T, const N: usize> Sealed for Simd<*const T, N> where LaneCount<N>: SupportedLaneCount {}
 
-impl<T, const LANES: usize> SimdConstPtr for Simd<*const T, LANES>
+impl<T, const N: usize> SimdConstPtr for Simd<*const T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
-    type Usize = Simd<usize, LANES>;
-    type Isize = Simd<isize, LANES>;
-    type CastPtr<U> = Simd<*const U, LANES>;
-    type MutPtr = Simd<*mut T, LANES>;
-    type Mask = Mask<isize, LANES>;
+    type Usize = Simd<usize, N>;
+    type Isize = Simd<isize, N>;
+    type CastPtr<U> = Simd<*const U, N>;
+    type MutPtr = Simd<*mut T, N>;
+    type Mask = Mask<isize, N>;
 
     #[inline]
     fn is_null(self) -> Self::Mask {
diff --git a/crates/core_simd/src/elements/mut_ptr.rs b/crates/core_simd/src/simd/ptr/mut_ptr.rs
similarity index 81%
rename from crates/core_simd/src/elements/mut_ptr.rs
rename to crates/core_simd/src/simd/ptr/mut_ptr.rs
index 4bdc6a14ce4a6..e35633d0433dd 100644
--- a/crates/core_simd/src/elements/mut_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/mut_ptr.rs
@@ -1,15 +1,17 @@
 use super::sealed::Sealed;
-use crate::simd::{intrinsics, LaneCount, Mask, Simd, SimdPartialEq, SimdUint, SupportedLaneCount};
+use crate::simd::{
+    cmp::SimdPartialEq, intrinsics, num::SimdUint, LaneCount, Mask, Simd, SupportedLaneCount,
+};
 
 /// Operations on SIMD vectors of mutable pointers.
 pub trait SimdMutPtr: Copy + Sealed {
-    /// Vector of `usize` with the same number of lanes.
+    /// Vector of `usize` with the same number of elements.
     type Usize;
 
-    /// Vector of `isize` with the same number of lanes.
+    /// Vector of `isize` with the same number of elements.
     type Isize;
 
-    /// Vector of const pointers with the same number of lanes.
+    /// Vector of mutable pointers with the same number of elements.
     type CastPtr<T>;
 
     /// Vector of constant pointers to the same type.
@@ -18,17 +20,17 @@ pub trait SimdMutPtr: Copy + Sealed {
     /// Mask type used for manipulating this SIMD vector type.
     type Mask;
 
-    /// Returns `true` for each lane that is null.
+    /// Returns `true` for each element that is null.
     fn is_null(self) -> Self::Mask;
 
     /// Casts to a pointer of another type.
     ///
-    /// Equivalent to calling [`pointer::cast`] on each lane.
+    /// Equivalent to calling [`pointer::cast`] on each element.
     fn cast<T>(self) -> Self::CastPtr<T>;
 
     /// Changes constness without changing the type.
     ///
-    /// Equivalent to calling [`pointer::cast_const`] on each lane.
+    /// Equivalent to calling [`pointer::cast_const`] on each element.
     fn cast_const(self) -> Self::ConstPtr;
 
     /// Gets the "address" portion of the pointer.
@@ -36,7 +38,7 @@ pub trait SimdMutPtr: Copy + Sealed {
     /// This method discards pointer semantic metadata, so the result cannot be
     /// directly cast into a valid pointer.
     ///
-    /// Equivalent to calling [`pointer::addr`] on each lane.
+    /// Equivalent to calling [`pointer::addr`] on each element.
     fn addr(self) -> Self::Usize;
 
     /// Creates a new pointer with the given address.
@@ -44,7 +46,7 @@ pub trait SimdMutPtr: Copy + Sealed {
     /// This performs the same operation as a cast, but copies the *address-space* and
     /// *provenance* of `self` to the new pointer.
     ///
-    /// Equivalent to calling [`pointer::with_addr`] on each lane.
+    /// Equivalent to calling [`pointer::with_addr`] on each element.
     fn with_addr(self, addr: Self::Usize) -> Self;
 
     /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use
@@ -53,37 +55,36 @@ pub trait SimdMutPtr: Copy + Sealed {
 
     /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
     ///
-    /// Equivalent to calling [`core::ptr::from_exposed_addr_mut`] on each lane.
+    /// Equivalent to calling [`core::ptr::from_exposed_addr_mut`] on each element.
     fn from_exposed_addr(addr: Self::Usize) -> Self;
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
-    /// Equivalent to calling [`pointer::wrapping_offset`] on each lane.
+    /// Equivalent to calling [`pointer::wrapping_offset`] on each element.
     fn wrapping_offset(self, offset: Self::Isize) -> Self;
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
-    /// Equivalent to calling [`pointer::wrapping_add`] on each lane.
+    /// Equivalent to calling [`pointer::wrapping_add`] on each element.
     fn wrapping_add(self, count: Self::Usize) -> Self;
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
-    /// Equivalent to calling [`pointer::wrapping_sub`] on each lane.
+    /// Equivalent to calling [`pointer::wrapping_sub`] on each element.
     fn wrapping_sub(self, count: Self::Usize) -> Self;
 }
 
-impl<T, const LANES: usize> Sealed for Simd<*mut T, LANES> where LaneCount<LANES>: SupportedLaneCount
-{}
+impl<T, const N: usize> Sealed for Simd<*mut T, N> where LaneCount<N>: SupportedLaneCount {}
 
-impl<T, const LANES: usize> SimdMutPtr for Simd<*mut T, LANES>
+impl<T, const N: usize> SimdMutPtr for Simd<*mut T, N>
 where
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
-    type Usize = Simd<usize, LANES>;
-    type Isize = Simd<isize, LANES>;
-    type CastPtr<U> = Simd<*mut U, LANES>;
-    type ConstPtr = Simd<*const T, LANES>;
-    type Mask = Mask<isize, LANES>;
+    type Usize = Simd<usize, N>;
+    type Isize = Simd<isize, N>;
+    type CastPtr<U> = Simd<*mut U, N>;
+    type ConstPtr = Simd<*const T, N>;
+    type Mask = Mask<isize, N>;
 
     #[inline]
     fn is_null(self) -> Self::Mask {
diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index 68f20516cf5bc..ec8548d557458 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -1,17 +1,15 @@
 use crate::simd::intrinsics;
-use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
+use crate::simd::{LaneCount, Mask, MaskElement, Simd, SimdElement, SupportedLaneCount};
 
-/// Constructs a new SIMD vector by copying elements from selected lanes in other vectors.
+/// Constructs a new SIMD vector by copying selected elements from other vectors.
 ///
-/// When swizzling one vector, lanes are selected by a `const` array of `usize`,
-/// like [`Swizzle`].
+/// When swizzling one vector, elements are selected like [`Swizzle::swizzle`].
 ///
-/// When swizzling two vectors, lanes are selected by a `const` array of [`Which`],
-/// like [`Swizzle2`].
+/// When swizzling two vectors, elements are selected like [`Swizzle::concat_swizzle`].
 ///
 /// # Examples
 ///
-/// With a single SIMD vector, the const array specifies lane indices in that vector:
+/// With a single SIMD vector, the const array specifies element indices in that vector:
 /// ```
 /// # #![feature(portable_simd)]
 /// # use core::simd::{u32x2, u32x4, simd_swizzle};
@@ -21,25 +19,27 @@ use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
 /// let r: u32x4 = simd_swizzle!(v, [3, 0, 1, 2]);
 /// assert_eq!(r.to_array(), [13, 10, 11, 12]);
 ///
-/// // Changing the number of lanes
+/// // Changing the number of elements
 /// let r: u32x2 = simd_swizzle!(v, [3, 1]);
 /// assert_eq!(r.to_array(), [13, 11]);
 /// ```
 ///
-/// With two input SIMD vectors, the const array uses `Which` to specify the source of each index:
+/// With two input SIMD vectors, the const array specifies element indices in the concatenation of
+/// those vectors:
 /// ```
 /// # #![feature(portable_simd)]
-/// # use core::simd::{u32x2, u32x4, simd_swizzle, Which};
-/// use Which::{First, Second};
+/// # #[cfg(feature = "as_crate")] use core_simd::simd;
+/// # #[cfg(not(feature = "as_crate"))] use core::simd;
+/// # use simd::{u32x2, u32x4, simd_swizzle};
 /// let a = u32x4::from_array([0, 1, 2, 3]);
 /// let b = u32x4::from_array([4, 5, 6, 7]);
 ///
 /// // Keeping the same size
-/// let r: u32x4 = simd_swizzle!(a, b, [First(0), First(1), Second(2), Second(3)]);
+/// let r: u32x4 = simd_swizzle!(a, b, [0, 1, 6, 7]);
 /// assert_eq!(r.to_array(), [0, 1, 6, 7]);
 ///
-/// // Changing the number of lanes
-/// let r: u32x2 = simd_swizzle!(a, b, [First(0), Second(0)]);
+/// // Changing the number of elements
+/// let r: u32x2 = simd_swizzle!(a, b, [0, 4]);
 /// assert_eq!(r.to_array(), [0, 4]);
 /// ```
 #[allow(unused_macros)]
@@ -50,7 +50,7 @@ pub macro simd_swizzle {
         {
             use $crate::simd::Swizzle;
             struct Impl;
-            impl<const LANES: usize> Swizzle<LANES, {$index.len()}> for Impl {
+            impl Swizzle<{$index.len()}> for Impl {
                 const INDEX: [usize; {$index.len()}] = $index;
             }
             Impl::swizzle($vector)
@@ -60,204 +60,194 @@ pub macro simd_swizzle {
         $first:expr, $second:expr, $index:expr $(,)?
     ) => {
         {
-            use $crate::simd::{Which, Swizzle2};
+            use $crate::simd::Swizzle;
             struct Impl;
-            impl<const LANES: usize> Swizzle2<LANES, {$index.len()}> for Impl {
-                const INDEX: [Which; {$index.len()}] = $index;
+            impl Swizzle<{$index.len()}> for Impl {
+                const INDEX: [usize; {$index.len()}] = $index;
             }
-            Impl::swizzle2($first, $second)
+            Impl::concat_swizzle($first, $second)
         }
     }
 }
 
-/// Specifies a lane index into one of two SIMD vectors.
-///
-/// This is an input type for [Swizzle2] and helper macros like [simd_swizzle].
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub enum Which {
-    /// Index of a lane in the first input SIMD vector.
-    First(usize),
-    /// Index of a lane in the second input SIMD vector.
-    Second(usize),
-}
-
 /// Create a vector from the elements of another vector.
-pub trait Swizzle<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
-    /// Map from the lanes of the input vector to the output vector.
-    const INDEX: [usize; OUTPUT_LANES];
+pub trait Swizzle<const N: usize> {
+    /// Map from the elements of the input vector to the output vector.
+    const INDEX: [usize; N];
 
-    /// Create a new vector from the lanes of `vector`.
+    /// Create a new vector from the elements of `vector`.
     ///
     /// Lane `i` of the output is `vector[Self::INDEX[i]]`.
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
-    fn swizzle<T>(vector: Simd<T, INPUT_LANES>) -> Simd<T, OUTPUT_LANES>
+    fn swizzle<T, const M: usize>(vector: Simd<T, M>) -> Simd<T, N>
     where
         T: SimdElement,
-        LaneCount<INPUT_LANES>: SupportedLaneCount,
-        LaneCount<OUTPUT_LANES>: SupportedLaneCount,
+        LaneCount<N>: SupportedLaneCount,
+        LaneCount<M>: SupportedLaneCount,
     {
-        // Safety: `vector` is a vector, and `INDEX_IMPL` is a const array of u32.
-        unsafe { intrinsics::simd_shuffle(vector, vector, Self::INDEX_IMPL) }
+        // Safety: `vector` is a vector, and the index is a const array of u32.
+        unsafe {
+            intrinsics::simd_shuffle(
+                vector,
+                vector,
+                const {
+                    let mut output = [0; N];
+                    let mut i = 0;
+                    while i < N {
+                        let index = Self::INDEX[i];
+                        assert!(index as u32 as usize == index);
+                        assert!(
+                            index < M,
+                            "source element index exceeds input vector length"
+                        );
+                        output[i] = index as u32;
+                        i += 1;
+                    }
+                    output
+                },
+            )
+        }
     }
-}
-
-/// Create a vector from the elements of two other vectors.
-pub trait Swizzle2<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
-    /// Map from the lanes of the input vectors to the output vector
-    const INDEX: [Which; OUTPUT_LANES];
 
-    /// Create a new vector from the lanes of `first` and `second`.
+    /// Create a new vector from the elements of `first` and `second`.
     ///
-    /// Lane `i` is `first[j]` when `Self::INDEX[i]` is `First(j)`, or `second[j]` when it is
-    /// `Second(j)`.
+    /// Element `i` of the output is `concat[Self::INDEX[i]]`, where `concat` is the concatenation of
+    /// `first` and `second`.
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
-    fn swizzle2<T>(
-        first: Simd<T, INPUT_LANES>,
-        second: Simd<T, INPUT_LANES>,
-    ) -> Simd<T, OUTPUT_LANES>
+    fn concat_swizzle<T, const M: usize>(first: Simd<T, M>, second: Simd<T, M>) -> Simd<T, N>
     where
         T: SimdElement,
-        LaneCount<INPUT_LANES>: SupportedLaneCount,
-        LaneCount<OUTPUT_LANES>: SupportedLaneCount,
+        LaneCount<N>: SupportedLaneCount,
+        LaneCount<M>: SupportedLaneCount,
     {
-        // Safety: `first` and `second` are vectors, and `INDEX_IMPL` is a const array of u32.
-        unsafe { intrinsics::simd_shuffle(first, second, Self::INDEX_IMPL) }
-    }
-}
-
-/// The `simd_shuffle` intrinsic expects `u32`, so do error checking and conversion here.
-/// This trait hides `INDEX_IMPL` from the public API.
-trait SwizzleImpl<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
-    const INDEX_IMPL: [u32; OUTPUT_LANES];
-}
-
-impl<T, const INPUT_LANES: usize, const OUTPUT_LANES: usize> SwizzleImpl<INPUT_LANES, OUTPUT_LANES>
-    for T
-where
-    T: Swizzle<INPUT_LANES, OUTPUT_LANES> + ?Sized,
-{
-    const INDEX_IMPL: [u32; OUTPUT_LANES] = {
-        let mut output = [0; OUTPUT_LANES];
-        let mut i = 0;
-        while i < OUTPUT_LANES {
-            let index = Self::INDEX[i];
-            assert!(index as u32 as usize == index);
-            assert!(index < INPUT_LANES, "source lane exceeds input lane count",);
-            output[i] = index as u32;
-            i += 1;
+        // Safety: `first` and `second` are vectors, and the index is a const array of u32.
+        unsafe {
+            intrinsics::simd_shuffle(
+                first,
+                second,
+                const {
+                    let mut output = [0; N];
+                    let mut i = 0;
+                    while i < N {
+                        let index = Self::INDEX[i];
+                        assert!(index as u32 as usize == index);
+                        assert!(
+                            index < 2 * M,
+                            "source element index exceeds input vector length"
+                        );
+                        output[i] = index as u32;
+                        i += 1;
+                    }
+                    output
+                },
+            )
         }
-        output
-    };
-}
-
-/// The `simd_shuffle` intrinsic expects `u32`, so do error checking and conversion here.
-/// This trait hides `INDEX_IMPL` from the public API.
-trait Swizzle2Impl<const INPUT_LANES: usize, const OUTPUT_LANES: usize> {
-    const INDEX_IMPL: [u32; OUTPUT_LANES];
-}
+    }
 
-impl<T, const INPUT_LANES: usize, const OUTPUT_LANES: usize> Swizzle2Impl<INPUT_LANES, OUTPUT_LANES>
-    for T
-where
-    T: Swizzle2<INPUT_LANES, OUTPUT_LANES> + ?Sized,
-{
-    const INDEX_IMPL: [u32; OUTPUT_LANES] = {
-        let mut output = [0; OUTPUT_LANES];
-        let mut i = 0;
-        while i < OUTPUT_LANES {
-            let (offset, index) = match Self::INDEX[i] {
-                Which::First(index) => (false, index),
-                Which::Second(index) => (true, index),
-            };
-            assert!(index < INPUT_LANES, "source lane exceeds input lane count",);
+    /// Create a new mask from the elements of `mask`.
+    ///
+    /// Element `i` of the output is `mask[Self::INDEX[i]]`; this is the mask equivalent of
+    /// [`Swizzle::swizzle`].
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original inputs"]
+    fn swizzle_mask<T, const M: usize>(mask: Mask<T, M>) -> Mask<T, N>
+    where
+        T: MaskElement,
+        LaneCount<N>: SupportedLaneCount,
+        LaneCount<M>: SupportedLaneCount,
+    {
+        // SAFETY: all elements of this mask come from another mask
+        unsafe { Mask::from_int_unchecked(Self::swizzle(mask.to_int())) }
+    }
 
-            // lanes are indexed by the first vector, then second vector
-            let index = if offset { index + INPUT_LANES } else { index };
-            assert!(index as u32 as usize == index);
-            output[i] = index as u32;
-            i += 1;
-        }
-        output
-    };
+    /// Create a new mask from the elements of `first` and `second`.
+    ///
+    /// Element `i` of the output is `concat[Self::INDEX[i]]`, where `concat` is the concatenation of
+    /// `first` and `second`.
+    #[inline]
+    #[must_use = "method returns a new mask and does not mutate the original inputs"]
+    fn concat_swizzle_mask<T, const M: usize>(first: Mask<T, M>, second: Mask<T, M>) -> Mask<T, N>
+    where
+        T: MaskElement,
+        LaneCount<N>: SupportedLaneCount,
+        LaneCount<M>: SupportedLaneCount,
+    {
+        // SAFETY: all elements of this mask come from another mask
+        unsafe { Mask::from_int_unchecked(Self::concat_swizzle(first.to_int(), second.to_int())) }
+    }
 }
 
-impl<T, const LANES: usize> Simd<T, LANES>
+impl<T, const N: usize> Simd<T, N>
 where
     T: SimdElement,
-    LaneCount<LANES>: SupportedLaneCount,
+    LaneCount<N>: SupportedLaneCount,
 {
-    /// Reverse the order of the lanes in the vector.
+    /// Reverse the order of the elements in the vector.
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn reverse(self) -> Self {
-        const fn reverse_index<const LANES: usize>() -> [usize; LANES] {
-            let mut index = [0; LANES];
-            let mut i = 0;
-            while i < LANES {
-                index[i] = LANES - i - 1;
-                i += 1;
-            }
-            index
-        }
-
         struct Reverse;
 
-        impl<const LANES: usize> Swizzle<LANES, LANES> for Reverse {
-            const INDEX: [usize; LANES] = reverse_index::<LANES>();
+        impl<const N: usize> Swizzle<N> for Reverse {
+            const INDEX: [usize; N] = const {
+                let mut index = [0; N];
+                let mut i = 0;
+                while i < N {
+                    index[i] = N - i - 1;
+                    i += 1;
+                }
+                index
+            };
         }
 
         Reverse::swizzle(self)
     }
 
     /// Rotates the vector such that the first `OFFSET` elements of the slice move to the end
-    /// while the last `LANES - OFFSET` elements move to the front. After calling `rotate_lanes_left`,
-    /// the element previously in lane `OFFSET` will become the first element in the slice.
+    /// while the last `self.len() - OFFSET` elements move to the front. After calling `rotate_elements_left`,
+    /// the element previously at index `OFFSET` will become the first element in the slice.
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
-    pub fn rotate_lanes_left<const OFFSET: usize>(self) -> Self {
-        const fn rotate_index<const OFFSET: usize, const LANES: usize>() -> [usize; LANES] {
-            let offset = OFFSET % LANES;
-            let mut index = [0; LANES];
-            let mut i = 0;
-            while i < LANES {
-                index[i] = (i + offset) % LANES;
-                i += 1;
-            }
-            index
-        }
-
+    pub fn rotate_elements_left<const OFFSET: usize>(self) -> Self {
         struct Rotate<const OFFSET: usize>;
 
-        impl<const OFFSET: usize, const LANES: usize> Swizzle<LANES, LANES> for Rotate<OFFSET> {
-            const INDEX: [usize; LANES] = rotate_index::<OFFSET, LANES>();
+        impl<const OFFSET: usize, const N: usize> Swizzle<N> for Rotate<OFFSET> {
+            const INDEX: [usize; N] = const {
+                let offset = OFFSET % N;
+                let mut index = [0; N];
+                let mut i = 0;
+                while i < N {
+                    index[i] = (i + offset) % N;
+                    i += 1;
+                }
+                index
+            };
         }
 
         Rotate::<OFFSET>::swizzle(self)
     }
 
-    /// Rotates the vector such that the first `LANES - OFFSET` elements of the vector move to
-    /// the end while the last `OFFSET` elements move to the front. After calling `rotate_lanes_right`,
-    /// the element previously at index `LANES - OFFSET` will become the first element in the slice.
+    /// Rotates the vector such that the first `self.len() - OFFSET` elements of the vector move to
+    /// the end while the last `OFFSET` elements move to the front. After calling `rotate_elements_right`,
+    /// the element previously at index `self.len() - OFFSET` will become the first element in the vector.
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
-    pub fn rotate_lanes_right<const OFFSET: usize>(self) -> Self {
-        const fn rotate_index<const OFFSET: usize, const LANES: usize>() -> [usize; LANES] {
-            let offset = LANES - OFFSET % LANES;
-            let mut index = [0; LANES];
-            let mut i = 0;
-            while i < LANES {
-                index[i] = (i + offset) % LANES;
-                i += 1;
-            }
-            index
-        }
-
+    pub fn rotate_elements_right<const OFFSET: usize>(self) -> Self {
         struct Rotate<const OFFSET: usize>;
 
-        impl<const OFFSET: usize, const LANES: usize> Swizzle<LANES, LANES> for Rotate<OFFSET> {
-            const INDEX: [usize; LANES] = rotate_index::<OFFSET, LANES>();
+        impl<const OFFSET: usize, const N: usize> Swizzle<N> for Rotate<OFFSET> {
+            const INDEX: [usize; N] = const {
+                let offset = N - OFFSET % N;
+                let mut index = [0; N];
+                let mut i = 0;
+                while i < N {
+                    index[i] = (i + offset) % N;
+                    i += 1;
+                }
+                index
+            };
         }
 
         Rotate::<OFFSET>::swizzle(self)
@@ -265,7 +255,7 @@ where
 
     /// Interleave two vectors.
     ///
-    /// The resulting vectors contain lanes taken alternatively from `self` and `other`, first
+    /// The resulting vectors contain elements taken alternatively from `self` and `other`, first
     /// filling the first result, and then the second.
     ///
     /// The reverse of this operation is [`Simd::deinterleave`].
@@ -282,18 +272,13 @@ where
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn interleave(self, other: Self) -> (Self, Self) {
-        const fn interleave<const LANES: usize>(high: bool) -> [Which; LANES] {
-            let mut idx = [Which::First(0); LANES];
+        const fn interleave<const N: usize>(high: bool) -> [usize; N] {
+            let mut idx = [0; N];
             let mut i = 0;
-            while i < LANES {
-                // Treat the source as a concatenated vector
-                let dst_index = if high { i + LANES } else { i };
-                let src_index = dst_index / 2 + (dst_index % 2) * LANES;
-                idx[i] = if src_index < LANES {
-                    Which::First(src_index)
-                } else {
-                    Which::Second(src_index % LANES)
-                };
+            while i < N {
+                let dst_index = if high { i + N } else { i };
+                let src_index = dst_index / 2 + (dst_index % 2) * N;
+                idx[i] = src_index;
                 i += 1;
             }
             idx
@@ -302,24 +287,27 @@ where
         struct Lo;
         struct Hi;
 
-        impl<const LANES: usize> Swizzle2<LANES, LANES> for Lo {
-            const INDEX: [Which; LANES] = interleave::<LANES>(false);
+        impl<const N: usize> Swizzle<N> for Lo {
+            const INDEX: [usize; N] = interleave::<N>(false);
         }
 
-        impl<const LANES: usize> Swizzle2<LANES, LANES> for Hi {
-            const INDEX: [Which; LANES] = interleave::<LANES>(true);
+        impl<const N: usize> Swizzle<N> for Hi {
+            const INDEX: [usize; N] = interleave::<N>(true);
         }
 
-        (Lo::swizzle2(self, other), Hi::swizzle2(self, other))
+        (
+            Lo::concat_swizzle(self, other),
+            Hi::concat_swizzle(self, other),
+        )
     }
 
     /// Deinterleave two vectors.
     ///
-    /// The first result takes every other lane of `self` and then `other`, starting with
-    /// the first lane.
+    /// The first result takes every other element of `self` and then `other`, starting with
+    /// the first element.
     ///
-    /// The second result takes every other lane of `self` and then `other`, starting with
-    /// the second lane.
+    /// The second result takes every other element of `self` and then `other`, starting with
+    /// the second element.
     ///
     /// The reverse of this operation is [`Simd::interleave`].
     ///
@@ -335,17 +323,11 @@ where
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn deinterleave(self, other: Self) -> (Self, Self) {
-        const fn deinterleave<const LANES: usize>(second: bool) -> [Which; LANES] {
-            let mut idx = [Which::First(0); LANES];
+        const fn deinterleave<const N: usize>(second: bool) -> [usize; N] {
+            let mut idx = [0; N];
             let mut i = 0;
-            while i < LANES {
-                // Treat the source as a concatenated vector
-                let src_index = i * 2 + second as usize;
-                idx[i] = if src_index < LANES {
-                    Which::First(src_index)
-                } else {
-                    Which::Second(src_index % LANES)
-                };
+            while i < N {
+                idx[i] = i * 2 + second as usize;
                 i += 1;
             }
             idx
@@ -354,14 +336,52 @@ where
         struct Even;
         struct Odd;
 
-        impl<const LANES: usize> Swizzle2<LANES, LANES> for Even {
-            const INDEX: [Which; LANES] = deinterleave::<LANES>(false);
+        impl<const N: usize> Swizzle<N> for Even {
+            const INDEX: [usize; N] = deinterleave::<N>(false);
         }
 
-        impl<const LANES: usize> Swizzle2<LANES, LANES> for Odd {
-            const INDEX: [Which; LANES] = deinterleave::<LANES>(true);
+        impl<const N: usize> Swizzle<N> for Odd {
+            const INDEX: [usize; N] = deinterleave::<N>(true);
         }
 
-        (Even::swizzle2(self, other), Odd::swizzle2(self, other))
+        (
+            Even::concat_swizzle(self, other),
+            Odd::concat_swizzle(self, other),
+        )
+    }
+
+    /// Resize a vector.
+    ///
+    /// If `M` > `N`, extends the length of a vector, setting the new elements to `value`.
+    /// If `M` < `N`, truncates the vector to the first `M` elements.
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::u32x4;
+    /// let x = u32x4::from_array([0, 1, 2, 3]);
+    /// assert_eq!(x.resize::<8>(9).to_array(), [0, 1, 2, 3, 9, 9, 9, 9]);
+    /// assert_eq!(x.resize::<2>(9).to_array(), [0, 1]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn resize<const M: usize>(self, value: T) -> Simd<T, M>
+    where
+        LaneCount<M>: SupportedLaneCount,
+    {
+        struct Resize<const N: usize>;
+        impl<const N: usize, const M: usize> Swizzle<M> for Resize<N> {
+            const INDEX: [usize; M] = const {
+                let mut index = [0; M];
+                let mut i = 0;
+                while i < M {
+                    index[i] = if i < N { i } else { N };
+                    i += 1;
+                }
+                index
+            };
+        }
+        Resize::<N>::concat_swizzle(self, Simd::splat(value))
     }
 }
diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs
index ce621792534e1..bd8a38e350d3b 100644
--- a/crates/core_simd/src/swizzle_dyn.rs
+++ b/crates/core_simd/src/swizzle_dyn.rs
@@ -86,7 +86,7 @@ where
 #[inline]
 #[allow(clippy::let_and_return)]
 unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
-    use crate::simd::SimdPartialOrd;
+    use crate::simd::cmp::SimdPartialOrd;
     #[cfg(target_arch = "x86")]
     use core::arch::x86;
     #[cfg(target_arch = "x86_64")]
@@ -149,7 +149,7 @@ where
     // On x86, make sure the top bit is set.
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
     let idxs = {
-        use crate::simd::SimdPartialOrd;
+        use crate::simd::cmp::SimdPartialOrd;
         idxs.simd_lt(Simd::splat(N as u8))
             .select(idxs, Simd::splat(u8::MAX))
     };
diff --git a/crates/core_simd/src/to_bytes.rs b/crates/core_simd/src/to_bytes.rs
index b36b1a347b226..222526c4ab30a 100644
--- a/crates/core_simd/src/to_bytes.rs
+++ b/crates/core_simd/src/to_bytes.rs
@@ -1,24 +1,125 @@
+use crate::simd::{
+    num::{SimdFloat, SimdInt, SimdUint},
+    LaneCount, Simd, SimdElement, SupportedLaneCount,
+};
+
+mod sealed {
+    use super::*;
+    pub trait Sealed {}
+    impl<T: SimdElement, const N: usize> Sealed for Simd<T, N> where LaneCount<N>: SupportedLaneCount {}
+}
+use sealed::Sealed;
+
+/// Convert SIMD vectors to vectors of bytes.
+pub trait ToBytes: Sealed {
+    /// This type, reinterpreted as bytes.
+    type Bytes: Copy
+        + Unpin
+        + Send
+        + Sync
+        + AsRef<[u8]>
+        + AsMut<[u8]>
+        + SimdUint<Scalar = u8>
+        + 'static;
+
+    /// Return the memory representation of this vector as a vector of bytes in native byte
+    /// order.
+    fn to_ne_bytes(self) -> Self::Bytes;
+
+    /// Return the memory representation of this vector as a vector of bytes in big-endian
+    /// (network) byte order.
+    fn to_be_bytes(self) -> Self::Bytes;
+
+    /// Return the memory representation of this vector as a vector of bytes in little-endian
+    /// byte order.
+    fn to_le_bytes(self) -> Self::Bytes;
+
+    /// Create a native endian vector from its memory representation as a vector of bytes
+    /// in native endianness.
+    fn from_ne_bytes(bytes: Self::Bytes) -> Self;
+
+    /// Create a vector from its representation as a vector of bytes in big endian.
+    fn from_be_bytes(bytes: Self::Bytes) -> Self;
+
+    /// Create a vector from its representation as a vector of bytes in little endian.
+    fn from_le_bytes(bytes: Self::Bytes) -> Self;
+}
+
+macro_rules! swap_bytes {
+    { f32, $x:expr } => { Simd::from_bits($x.to_bits().swap_bytes()) };
+    { f64, $x:expr } => { Simd::from_bits($x.to_bits().swap_bytes()) };
+    { $ty:ty, $x:expr } => { $x.swap_bytes() }
+}
+
 macro_rules! impl_to_bytes {
-    { $ty:ty, $size:literal } => {
-        impl<const LANES: usize> crate::simd::Simd<$ty, LANES>
-        where
-            crate::simd::LaneCount<LANES>: crate::simd::SupportedLaneCount,
-            crate::simd::LaneCount<{{ $size * LANES }}>: crate::simd::SupportedLaneCount,
-        {
-            /// Return the memory representation of this integer as a byte array in native byte
-            /// order.
-            pub fn to_ne_bytes(self) -> crate::simd::Simd<u8, {{ $size * LANES }}> {
+    { $ty:tt, 1  } => { impl_to_bytes! { $ty, 1  * [1, 2, 4, 8, 16, 32, 64] } };
+    { $ty:tt, 2  } => { impl_to_bytes! { $ty, 2  * [1, 2, 4, 8, 16, 32] } };
+    { $ty:tt, 4  } => { impl_to_bytes! { $ty, 4  * [1, 2, 4, 8, 16] } };
+    { $ty:tt, 8  } => { impl_to_bytes! { $ty, 8  * [1, 2, 4, 8] } };
+    { $ty:tt, 16 } => { impl_to_bytes! { $ty, 16 * [1, 2, 4] } };
+    { $ty:tt, 32 } => { impl_to_bytes! { $ty, 32 * [1, 2] } };
+    { $ty:tt, 64 } => { impl_to_bytes! { $ty, 64 * [1] } };
+
+    { $ty:tt, $size:literal * [$($elems:literal),*] } => {
+        $(
+        impl ToBytes for Simd<$ty, $elems> {
+            type Bytes = Simd<u8, { $size * $elems }>;
+
+            #[inline]
+            fn to_ne_bytes(self) -> Self::Bytes {
                 // Safety: transmuting between vectors is safe
-                unsafe { core::mem::transmute_copy(&self) }
+                unsafe {
+                    #![allow(clippy::useless_transmute)]
+                    core::mem::transmute(self)
+                }
+            }
+
+            #[inline]
+            fn to_be_bytes(mut self) -> Self::Bytes {
+                if !cfg!(target_endian = "big") {
+                    self = swap_bytes!($ty, self);
+                }
+                self.to_ne_bytes()
+            }
+
+            #[inline]
+            fn to_le_bytes(mut self) -> Self::Bytes {
+                if !cfg!(target_endian = "little") {
+                    self = swap_bytes!($ty, self);
+                }
+                self.to_ne_bytes()
             }
 
-            /// Create a native endian integer value from its memory representation as a byte array
-            /// in native endianness.
-            pub fn from_ne_bytes(bytes: crate::simd::Simd<u8, {{ $size * LANES }}>) -> Self {
+            #[inline]
+            fn from_ne_bytes(bytes: Self::Bytes) -> Self {
                 // Safety: transmuting between vectors is safe
-                unsafe { core::mem::transmute_copy(&bytes) }
+                unsafe {
+                    #![allow(clippy::useless_transmute)]
+                    core::mem::transmute(bytes)
+                }
+            }
+
+            #[inline]
+            fn from_be_bytes(bytes: Self::Bytes) -> Self {
+                let ret = Self::from_ne_bytes(bytes);
+                if cfg!(target_endian = "big") {
+                    ret
+                } else {
+                    swap_bytes!($ty, ret)
+                }
+            }
+
+            #[inline]
+            fn from_le_bytes(bytes: Self::Bytes) -> Self {
+                let ret = Self::from_ne_bytes(bytes);
+                if cfg!(target_endian = "little") {
+                    ret
+                } else {
+                    swap_bytes!($ty, ret)
+                }
             }
         }
+        )*
     }
 }
 
@@ -39,3 +140,6 @@ impl_to_bytes! { i64, 8 }
 impl_to_bytes! { isize, 4 }
 #[cfg(target_pointer_width = "64")]
 impl_to_bytes! { isize, 8 }
+
+impl_to_bytes! { f32, 4 }
+impl_to_bytes! { f64, 8 }
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 9aa7bacfce981..105c06741c58b 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -1,6 +1,8 @@
 use crate::simd::{
-    intrinsics, LaneCount, Mask, MaskElement, SimdConstPtr, SimdMutPtr, SimdPartialOrd,
-    SupportedLaneCount, Swizzle,
+    cmp::SimdPartialOrd,
+    intrinsics,
+    ptr::{SimdConstPtr, SimdMutPtr},
+    LaneCount, Mask, MaskElement, SupportedLaneCount, Swizzle,
 };
 use core::convert::{TryFrom, TryInto};
 
@@ -110,7 +112,7 @@ where
     T: SimdElement,
 {
     /// Number of elements in this vector.
-    pub const LANES: usize = N;
+    pub const LEN: usize = N;
 
     /// Returns the number of elements in this SIMD vector.
     ///
@@ -118,13 +120,16 @@ where
     ///
     /// ```
     /// # #![feature(portable_simd)]
-    /// # use core::simd::u32x4;
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::u32x4;
     /// let v = u32x4::splat(0);
-    /// assert_eq!(v.lanes(), 4);
+    /// assert_eq!(v.len(), 4);
     /// ```
     #[inline]
-    pub const fn lanes(&self) -> usize {
-        Self::LANES
+    #[allow(clippy::len_without_is_empty)]
+    pub const fn len(&self) -> usize {
+        Self::LEN
     }
 
     /// Constructs a new SIMD vector with all elements set to the given value.
@@ -133,7 +138,9 @@ where
     ///
     /// ```
     /// # #![feature(portable_simd)]
-    /// # use core::simd::u32x4;
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::u32x4;
     /// let v = u32x4::splat(8);
     /// assert_eq!(v.as_array(), &[8, 8, 8, 8]);
     /// ```
@@ -142,10 +149,10 @@ where
         // This is preferred over `[value; N]`, since it's explicitly a splat:
         // https://github.com/rust-lang/rust/issues/97804
         struct Splat;
-        impl<const N: usize> Swizzle<1, N> for Splat {
+        impl<const N: usize> Swizzle<N> for Splat {
             const INDEX: [usize; N] = [0; N];
         }
-        Splat::swizzle(Simd::<T, 1>::from([value]))
+        Splat::swizzle::<T, 1>(Simd::<T, 1>::from([value]))
     }
 
     /// Returns an array reference containing the entire SIMD vector.
@@ -271,7 +278,7 @@ where
     #[track_caller]
     pub const fn from_slice(slice: &[T]) -> Self {
         assert!(
-            slice.len() >= Self::LANES,
+            slice.len() >= Self::LEN,
             "slice length must be at least the number of elements"
         );
         // SAFETY: We just checked that the slice contains
@@ -301,7 +308,7 @@ where
     #[track_caller]
     pub fn copy_to_slice(self, slice: &mut [T]) {
         assert!(
-            slice.len() >= Self::LANES,
+            slice.len() >= Self::LEN,
             "slice length must be at least the number of elements"
         );
         // SAFETY: We just checked that the slice contains
@@ -394,7 +401,7 @@ where
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{Simd, SimdPartialOrd, Mask};
+    /// # use simd::{Simd, cmp::SimdPartialOrd, Mask};
     /// let vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
     /// let idxs = Simd::from_array([9, 3, 0, 5]); // Includes an out-of-bounds index
     /// let alt = Simd::from_array([-5, -4, -3, -2]);
@@ -434,7 +441,7 @@ where
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{Simd, SimdConstPtr};
+    /// # use simd::prelude::*;
     /// let values = [6, 2, 4, 9];
     /// let offsets = Simd::from_array([1, 0, 0, 3]);
     /// let source = Simd::splat(values.as_ptr()).wrapping_add(offsets);
@@ -467,7 +474,7 @@ where
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{Mask, Simd, SimdConstPtr};
+    /// # use simd::prelude::*;
     /// let values = [6, 2, 4, 9];
     /// let enable = Mask::from_array([true, true, false, true]);
     /// let offsets = Simd::from_array([1, 0, 0, 3]);
@@ -550,7 +557,7 @@ where
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{Simd, SimdPartialOrd, Mask};
+    /// # use simd::{Simd, cmp::SimdPartialOrd, Mask};
     /// let mut vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
     /// let idxs = Simd::from_array([9, 3, 0, 0]);
     /// let vals = Simd::from_array([-27, 82, -41, 124]);
@@ -604,7 +611,7 @@ where
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{Simd, SimdMutPtr};
+    /// # use simd::{Simd, ptr::SimdMutPtr};
     /// let mut values = [0; 4];
     /// let offset = Simd::from_array([3, 2, 1, 0]);
     /// let ptrs = Simd::splat(values.as_mut_ptr()).wrapping_add(offset);
@@ -631,7 +638,7 @@ where
     /// # #![feature(portable_simd)]
     /// # #[cfg(feature = "as_crate")] use core_simd::simd;
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{Mask, Simd, SimdMutPtr};
+    /// # use simd::{Mask, Simd, ptr::SimdMutPtr};
     /// let mut values = [0; 4];
     /// let offset = Simd::from_array([3, 2, 1, 0]);
     /// let ptrs = Simd::splat(values.as_mut_ptr()).wrapping_add(offset);
diff --git a/crates/core_simd/src/vendor.rs b/crates/core_simd/src/vendor.rs
index 9fb70218c9543..6223bedb4e13c 100644
--- a/crates/core_simd/src/vendor.rs
+++ b/crates/core_simd/src/vendor.rs
@@ -21,7 +21,7 @@ macro_rules! from_transmute {
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
 mod x86;
 
-#[cfg(any(target_arch = "wasm32"))]
+#[cfg(target_arch = "wasm32")]
 mod wasm32;
 
 #[cfg(any(target_arch = "aarch64", target_arch = "arm",))]
diff --git a/crates/core_simd/src/vendor/x86.rs b/crates/core_simd/src/vendor/x86.rs
index 0dd47015ed22f..66aaf90eef597 100644
--- a/crates/core_simd/src/vendor/x86.rs
+++ b/crates/core_simd/src/vendor/x86.rs
@@ -1,6 +1,6 @@
 use crate::simd::*;
 
-#[cfg(any(target_arch = "x86"))]
+#[cfg(target_arch = "x86")]
 use core::arch::x86::*;
 
 #[cfg(target_arch = "x86_64")]
diff --git a/crates/core_simd/tests/cast.rs b/crates/core_simd/tests/cast.rs
index 00545936ea2a6..185e1945faa2f 100644
--- a/crates/core_simd/tests/cast.rs
+++ b/crates/core_simd/tests/cast.rs
@@ -3,7 +3,7 @@ macro_rules! cast_types {
     ($start:ident, $($target:ident),*) => {
         mod $start {
             #[allow(unused)]
-            use core_simd::simd::{Simd, SimdInt, SimdUint, SimdFloat};
+            use core_simd::simd::prelude::*;
             type Vector<const N: usize> = Simd<$start, N>;
             $(
                 mod $target {
diff --git a/crates/core_simd/tests/masks.rs b/crates/core_simd/tests/masks.rs
index 9f8bad1c36c08..00fc2a24e27a6 100644
--- a/crates/core_simd/tests/masks.rs
+++ b/crates/core_simd/tests/masks.rs
@@ -72,7 +72,6 @@ macro_rules! test_mask_api {
 
             #[test]
             fn roundtrip_bitmask_conversion() {
-                use core_simd::simd::ToBitMask;
                 let values = [
                     true, false, false, true, false, false, true, false,
                     true, true, false, false, false, false, false, true,
@@ -85,8 +84,6 @@ macro_rules! test_mask_api {
 
             #[test]
             fn roundtrip_bitmask_conversion_short() {
-                use core_simd::simd::ToBitMask;
-
                 let values = [
                     false, false, false, true,
                 ];
@@ -125,18 +122,17 @@ macro_rules! test_mask_api {
                 cast_impl::<isize>();
             }
 
-            #[cfg(feature = "generic_const_exprs")]
             #[test]
-            fn roundtrip_bitmask_array_conversion() {
-                use core_simd::simd::ToBitMaskArray;
+            fn roundtrip_bitmask_vector_conversion() {
+                use core_simd::simd::ToBytes;
                 let values = [
                     true, false, false, true, false, false, true, false,
                     true, true, false, false, false, false, false, true,
                 ];
                 let mask = Mask::<$type, 16>::from_array(values);
-                let bitmask = mask.to_bitmask_array();
-                assert_eq!(bitmask, [0b01001001, 0b10000011]);
-                assert_eq!(Mask::<$type, 16>::from_bitmask_array(bitmask), mask);
+                let bitmask = mask.to_bitmask_vector();
+                assert_eq!(bitmask.resize::<2>(0).to_ne_bytes()[..2], [0b01001001, 0b10000011]);
+                assert_eq!(Mask::<$type, 16>::from_bitmask_vector(bitmask), mask);
             }
         }
     }
diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs
index 3a02f3f01e1cf..aa565a137527e 100644
--- a/crates/core_simd/tests/ops_macros.rs
+++ b/crates/core_simd/tests/ops_macros.rs
@@ -6,7 +6,7 @@ macro_rules! impl_unary_op_test {
     { $scalar:ty, $trait:ident :: $fn:ident, $scalar_fn:expr } => {
         test_helpers::test_lanes! {
             fn $fn<const LANES: usize>() {
-                test_helpers::test_unary_elementwise(
+                test_helpers::test_unary_elementwise_flush_subnormals(
                     &<core_simd::simd::Simd<$scalar, LANES> as core::ops::$trait>::$fn,
                     &$scalar_fn,
                     &|_| true,
@@ -31,7 +31,7 @@ macro_rules! impl_binary_op_test {
 
             test_helpers::test_lanes! {
                 fn normal<const LANES: usize>() {
-                    test_helpers::test_binary_elementwise(
+                    test_helpers::test_binary_elementwise_flush_subnormals(
                         &<Simd<$scalar, LANES> as core::ops::$trait>::$fn,
                         &$scalar_fn,
                         &|_, _| true,
@@ -39,7 +39,7 @@ macro_rules! impl_binary_op_test {
                 }
 
                 fn assign<const LANES: usize>() {
-                    test_helpers::test_binary_elementwise(
+                    test_helpers::test_binary_elementwise_flush_subnormals(
                         &|mut a, b| { <Simd<$scalar, LANES> as core::ops::$trait_assign>::$fn_assign(&mut a, b); a },
                         &$scalar_fn,
                         &|_, _| true,
@@ -68,6 +68,7 @@ macro_rules! impl_binary_checked_op_test {
 
             test_helpers::test_lanes! {
                 fn normal<const LANES: usize>() {
+                    #![allow(clippy::redundant_closure_call)]
                     test_helpers::test_binary_elementwise(
                         &<Simd<$scalar, LANES> as core::ops::$trait>::$fn,
                         &$scalar_fn,
@@ -76,6 +77,7 @@ macro_rules! impl_binary_checked_op_test {
                 }
 
                 fn assign<const LANES: usize>() {
+                    #![allow(clippy::redundant_closure_call)]
                     test_helpers::test_binary_elementwise(
                         &|mut a, b| { <Simd<$scalar, LANES> as core::ops::$trait_assign>::$fn_assign(&mut a, b); a },
                         &$scalar_fn,
@@ -94,11 +96,43 @@ macro_rules! impl_binary_checked_op_test {
 macro_rules! impl_common_integer_tests {
     { $vector:ident, $scalar:ident } => {
         test_helpers::test_lanes! {
+            fn shr<const LANES: usize>() {
+                use core::ops::Shr;
+                let shr = |x: $scalar, y: $scalar| x.wrapping_shr(y as _);
+                test_helpers::test_binary_elementwise(
+                    &<$vector::<LANES> as Shr<$vector::<LANES>>>::shr,
+                    &shr,
+                    &|_, _| true,
+                );
+                test_helpers::test_binary_scalar_rhs_elementwise(
+                    &<$vector::<LANES> as Shr<$scalar>>::shr,
+                    &shr,
+                    &|_, _| true,
+                );
+            }
+
+            fn shl<const LANES: usize>() {
+                use core::ops::Shl;
+                let shl = |x: $scalar, y: $scalar| x.wrapping_shl(y as _);
+                test_helpers::test_binary_elementwise(
+                    &<$vector::<LANES> as Shl<$vector::<LANES>>>::shl,
+                    &shl,
+                    &|_, _| true,
+                );
+                test_helpers::test_binary_scalar_rhs_elementwise(
+                    &<$vector::<LANES> as Shl<$scalar>>::shl,
+                    &shl,
+                    &|_, _| true,
+                );
+            }
+
             fn reduce_sum<const LANES: usize>() {
                 test_helpers::test_1(&|x| {
+                    use test_helpers::subnormals::{flush, flush_in};
                     test_helpers::prop_assert_biteq! (
                         $vector::<LANES>::from_array(x).reduce_sum(),
                         x.iter().copied().fold(0 as $scalar, $scalar::wrapping_add),
+                        flush(x.iter().copied().map(flush_in).fold(0 as $scalar, $scalar::wrapping_add)),
                     );
                     Ok(())
                 });
@@ -106,9 +140,11 @@ macro_rules! impl_common_integer_tests {
 
             fn reduce_product<const LANES: usize>() {
                 test_helpers::test_1(&|x| {
+                    use test_helpers::subnormals::{flush, flush_in};
                     test_helpers::prop_assert_biteq! (
                         $vector::<LANES>::from_array(x).reduce_product(),
                         x.iter().copied().fold(1 as $scalar, $scalar::wrapping_mul),
+                        flush(x.iter().copied().map(flush_in).fold(1 as $scalar, $scalar::wrapping_mul)),
                     );
                     Ok(())
                 });
@@ -163,6 +199,54 @@ macro_rules! impl_common_integer_tests {
                     Ok(())
                 });
             }
+
+            fn swap_bytes<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &$vector::<LANES>::swap_bytes,
+                    &$scalar::swap_bytes,
+                    &|_| true,
+                )
+            }
+
+            fn reverse_bits<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &$vector::<LANES>::reverse_bits,
+                    &$scalar::reverse_bits,
+                    &|_| true,
+                )
+            }
+
+            fn leading_zeros<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &$vector::<LANES>::leading_zeros,
+                    &|x| x.leading_zeros() as _,
+                    &|_| true,
+                )
+            }
+
+            fn trailing_zeros<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &$vector::<LANES>::trailing_zeros,
+                    &|x| x.trailing_zeros() as _,
+                    &|_| true,
+                )
+            }
+
+            fn leading_ones<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &$vector::<LANES>::leading_ones,
+                    &|x| x.leading_ones() as _,
+                    &|_| true,
+                )
+            }
+
+            fn trailing_ones<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &$vector::<LANES>::trailing_ones,
+                    &|x| x.trailing_ones() as _,
+                    &|_| true,
+                )
+            }
         }
     }
 }
@@ -172,7 +256,7 @@ macro_rules! impl_common_integer_tests {
 macro_rules! impl_signed_tests {
     { $scalar:tt } => {
         mod $scalar {
-            use core_simd::simd::SimdInt;
+            use core_simd::simd::num::SimdInt;
             type Vector<const LANES: usize> = core_simd::simd::Simd<Scalar, LANES>;
             type Scalar = $scalar;
 
@@ -224,7 +308,7 @@ macro_rules! impl_signed_tests {
                 }
 
                 fn simd_min<const LANES: usize>() {
-                    use core_simd::simd::SimdOrd;
+                    use core_simd::simd::cmp::SimdOrd;
                     let a = Vector::<LANES>::splat(Scalar::MIN);
                     let b = Vector::<LANES>::splat(0);
                     assert_eq!(a.simd_min(b), a);
@@ -234,7 +318,7 @@ macro_rules! impl_signed_tests {
                 }
 
                 fn simd_max<const LANES: usize>() {
-                    use core_simd::simd::SimdOrd;
+                    use core_simd::simd::cmp::SimdOrd;
                     let a = Vector::<LANES>::splat(Scalar::MIN);
                     let b = Vector::<LANES>::splat(0);
                     assert_eq!(a.simd_max(b), b);
@@ -244,7 +328,7 @@ macro_rules! impl_signed_tests {
                 }
 
                 fn simd_clamp<const LANES: usize>() {
-                    use core_simd::simd::SimdOrd;
+                    use core_simd::simd::cmp::SimdOrd;
                     let min = Vector::<LANES>::splat(Scalar::MIN);
                     let max = Vector::<LANES>::splat(Scalar::MAX);
                     let zero = Vector::<LANES>::splat(0);
@@ -313,7 +397,7 @@ macro_rules! impl_signed_tests {
 macro_rules! impl_unsigned_tests {
     { $scalar:tt } => {
         mod $scalar {
-            use core_simd::simd::SimdUint;
+            use core_simd::simd::num::SimdUint;
             type Vector<const LANES: usize> = core_simd::simd::Simd<Scalar, LANES>;
             type Scalar = $scalar;
 
@@ -327,6 +411,16 @@ macro_rules! impl_unsigned_tests {
                 }
             }
 
+            test_helpers::test_lanes! {
+                fn wrapping_neg<const LANES: usize>() {
+                    test_helpers::test_unary_elementwise(
+                        &Vector::<LANES>::wrapping_neg,
+                        &Scalar::wrapping_neg,
+                        &|_| true,
+                    );
+                }
+            }
+
             impl_binary_op_test!(Scalar, Add::add, AddAssign::add_assign, Scalar::wrapping_add);
             impl_binary_op_test!(Scalar, Sub::sub, SubAssign::sub_assign, Scalar::wrapping_sub);
             impl_binary_op_test!(Scalar, Mul::mul, MulAssign::mul_assign, Scalar::wrapping_mul);
@@ -348,7 +442,7 @@ macro_rules! impl_unsigned_tests {
 macro_rules! impl_float_tests {
     { $scalar:tt, $int_scalar:tt } => {
         mod $scalar {
-            use core_simd::simd::SimdFloat;
+            use core_simd::simd::num::SimdFloat;
             type Vector<const LANES: usize> = core_simd::simd::Simd<Scalar, LANES>;
             type Scalar = $scalar;
 
@@ -433,7 +527,7 @@ macro_rules! impl_float_tests {
                 }
 
                 fn to_degrees<const LANES: usize>() {
-                    test_helpers::test_unary_elementwise(
+                    test_helpers::test_unary_elementwise_flush_subnormals(
                         &Vector::<LANES>::to_degrees,
                         &Scalar::to_degrees,
                         &|_| true,
@@ -441,7 +535,7 @@ macro_rules! impl_float_tests {
                 }
 
                 fn to_radians<const LANES: usize>() {
-                    test_helpers::test_unary_elementwise(
+                    test_helpers::test_unary_elementwise_flush_subnormals(
                         &Vector::<LANES>::to_radians,
                         &Scalar::to_radians,
                         &|_| true,
@@ -511,7 +605,12 @@ macro_rules! impl_float_tests {
                 }
 
                 fn simd_clamp<const LANES: usize>() {
+                    if cfg!(all(target_arch = "powerpc64", target_feature = "vsx")) {
+                        // https://gitlab.com/qemu-project/qemu/-/issues/1780
+                        return;
+                    }
                     test_helpers::test_3(&|value: [Scalar; LANES], mut min: [Scalar; LANES], mut max: [Scalar; LANES]| {
+                        use test_helpers::subnormals::flush_in;
                         for (min, max) in min.iter_mut().zip(max.iter_mut()) {
                             if max < min {
                                 core::mem::swap(min, max);
@@ -528,8 +627,20 @@ macro_rules! impl_float_tests {
                         for i in 0..LANES {
                             result_scalar[i] = value[i].clamp(min[i], max[i]);
                         }
+                        let mut result_scalar_flush = [Scalar::default(); LANES];
+                        for i in 0..LANES {
+                            // Comparisons flush-to-zero, but return value selection is _not_ flushed.
+                            let mut value = value[i];
+                            if flush_in(value) < flush_in(min[i]) {
+                                value = min[i];
+                            }
+                            if flush_in(value) > flush_in(max[i]) {
+                                value = max[i];
+                            }
+                            result_scalar_flush[i] = value
+                        }
                         let result_vector = Vector::from_array(value).simd_clamp(min.into(), max.into()).to_array();
-                        test_helpers::prop_assert_biteq!(result_scalar, result_vector);
+                        test_helpers::prop_assert_biteq!(result_vector, result_scalar, result_scalar_flush);
                         Ok(())
                     })
                 }
diff --git a/crates/core_simd/tests/pointers.rs b/crates/core_simd/tests/pointers.rs
index 0ae8f83b8b97d..a90ff928cedcb 100644
--- a/crates/core_simd/tests/pointers.rs
+++ b/crates/core_simd/tests/pointers.rs
@@ -1,6 +1,9 @@
 #![feature(portable_simd, strict_provenance)]
 
-use core_simd::simd::{Simd, SimdConstPtr, SimdMutPtr};
+use core_simd::simd::{
+    ptr::{SimdConstPtr, SimdMutPtr},
+    Simd,
+};
 
 macro_rules! common_tests {
     { $constness:ident } => {
diff --git a/crates/core_simd/tests/round.rs b/crates/core_simd/tests/round.rs
index aacf7bd3bcc2c..847766ec41ed2 100644
--- a/crates/core_simd/tests/round.rs
+++ b/crates/core_simd/tests/round.rs
@@ -43,7 +43,7 @@ macro_rules! float_rounding_test {
                 }
 
                 fn fract<const LANES: usize>() {
-                    test_helpers::test_unary_elementwise(
+                    test_helpers::test_unary_elementwise_flush_subnormals(
                         &Vector::<LANES>::fract,
                         &Scalar::fract,
                         &|_| true,
@@ -53,7 +53,7 @@ macro_rules! float_rounding_test {
 
             test_helpers::test_lanes! {
                 fn to_int_unchecked<const LANES: usize>() {
-                    use core_simd::simd::SimdFloat;
+                    use core_simd::simd::num::SimdFloat;
                     // The maximum integer that can be represented by the equivalently sized float has
                     // all of the mantissa digits set to 1, pushed up to the MSB.
                     const ALL_MANTISSA_BITS: IntScalar = ((1 << <Scalar>::MANTISSA_DIGITS) - 1);
diff --git a/crates/core_simd/tests/swizzle.rs b/crates/core_simd/tests/swizzle.rs
index 8cd7c33e823fb..522d71439b77d 100644
--- a/crates/core_simd/tests/swizzle.rs
+++ b/crates/core_simd/tests/swizzle.rs
@@ -11,10 +11,10 @@ wasm_bindgen_test_configure!(run_in_browser);
 #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
 fn swizzle() {
     struct Index;
-    impl Swizzle<4, 4> for Index {
+    impl Swizzle<4> for Index {
         const INDEX: [usize; 4] = [2, 1, 3, 0];
     }
-    impl Swizzle<4, 2> for Index {
+    impl Swizzle<2> for Index {
         const INDEX: [usize; 2] = [1, 1];
     }
 
@@ -34,18 +34,18 @@ fn reverse() {
 #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
 fn rotate() {
     let a = Simd::from_array([1, 2, 3, 4]);
-    assert_eq!(a.rotate_lanes_left::<0>().to_array(), [1, 2, 3, 4]);
-    assert_eq!(a.rotate_lanes_left::<1>().to_array(), [2, 3, 4, 1]);
-    assert_eq!(a.rotate_lanes_left::<2>().to_array(), [3, 4, 1, 2]);
-    assert_eq!(a.rotate_lanes_left::<3>().to_array(), [4, 1, 2, 3]);
-    assert_eq!(a.rotate_lanes_left::<4>().to_array(), [1, 2, 3, 4]);
-    assert_eq!(a.rotate_lanes_left::<5>().to_array(), [2, 3, 4, 1]);
-    assert_eq!(a.rotate_lanes_right::<0>().to_array(), [1, 2, 3, 4]);
-    assert_eq!(a.rotate_lanes_right::<1>().to_array(), [4, 1, 2, 3]);
-    assert_eq!(a.rotate_lanes_right::<2>().to_array(), [3, 4, 1, 2]);
-    assert_eq!(a.rotate_lanes_right::<3>().to_array(), [2, 3, 4, 1]);
-    assert_eq!(a.rotate_lanes_right::<4>().to_array(), [1, 2, 3, 4]);
-    assert_eq!(a.rotate_lanes_right::<5>().to_array(), [4, 1, 2, 3]);
+    assert_eq!(a.rotate_elements_left::<0>().to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.rotate_elements_left::<1>().to_array(), [2, 3, 4, 1]);
+    assert_eq!(a.rotate_elements_left::<2>().to_array(), [3, 4, 1, 2]);
+    assert_eq!(a.rotate_elements_left::<3>().to_array(), [4, 1, 2, 3]);
+    assert_eq!(a.rotate_elements_left::<4>().to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.rotate_elements_left::<5>().to_array(), [2, 3, 4, 1]);
+    assert_eq!(a.rotate_elements_right::<0>().to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.rotate_elements_right::<1>().to_array(), [4, 1, 2, 3]);
+    assert_eq!(a.rotate_elements_right::<2>().to_array(), [3, 4, 1, 2]);
+    assert_eq!(a.rotate_elements_right::<3>().to_array(), [2, 3, 4, 1]);
+    assert_eq!(a.rotate_elements_right::<4>().to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.rotate_elements_right::<5>().to_array(), [4, 1, 2, 3]);
 }
 
 #[test]
diff --git a/crates/core_simd/tests/swizzle_dyn.rs b/crates/core_simd/tests/swizzle_dyn.rs
index 646cd5f338335..f21a937f01c44 100644
--- a/crates/core_simd/tests/swizzle_dyn.rs
+++ b/crates/core_simd/tests/swizzle_dyn.rs
@@ -1,6 +1,5 @@
 #![feature(portable_simd)]
 use core::{fmt, ops::RangeInclusive};
-use proptest;
 use test_helpers::{self, biteq, make_runner, prop_assert_biteq};
 
 fn swizzle_dyn_scalar_ver<const N: usize>(values: [u8; N], idxs: [u8; N]) -> [u8; N] {
diff --git a/crates/core_simd/tests/to_bytes.rs b/crates/core_simd/tests/to_bytes.rs
index be0ee4349c579..66a7981cdc3dd 100644
--- a/crates/core_simd/tests/to_bytes.rs
+++ b/crates/core_simd/tests/to_bytes.rs
@@ -1,14 +1,20 @@
-#![feature(portable_simd, generic_const_exprs, adt_const_params)]
-#![allow(incomplete_features)]
-#![cfg(feature = "generic_const_exprs")]
+#![feature(portable_simd)]
 
-use core_simd::simd::Simd;
+use core_simd::simd::{Simd, ToBytes};
 
 #[test]
 fn byte_convert() {
     let int = Simd::<u32, 2>::from_array([0xdeadbeef, 0x8badf00d]);
-    let bytes = int.to_ne_bytes();
-    assert_eq!(int[0].to_ne_bytes(), bytes[..4]);
-    assert_eq!(int[1].to_ne_bytes(), bytes[4..]);
-    assert_eq!(Simd::<u32, 2>::from_ne_bytes(bytes), int);
+    let ne_bytes = int.to_ne_bytes();
+    let be_bytes = int.to_be_bytes();
+    let le_bytes = int.to_le_bytes();
+    assert_eq!(int[0].to_ne_bytes(), ne_bytes[..4]);
+    assert_eq!(int[1].to_ne_bytes(), ne_bytes[4..]);
+    assert_eq!(int[0].to_be_bytes(), be_bytes[..4]);
+    assert_eq!(int[1].to_be_bytes(), be_bytes[4..]);
+    assert_eq!(int[0].to_le_bytes(), le_bytes[..4]);
+    assert_eq!(int[1].to_le_bytes(), le_bytes[4..]);
+    assert_eq!(Simd::<u32, 2>::from_ne_bytes(ne_bytes), int);
+    assert_eq!(Simd::<u32, 2>::from_be_bytes(be_bytes), int);
+    assert_eq!(Simd::<u32, 2>::from_le_bytes(le_bytes), int);
 }
diff --git a/crates/std_float/src/lib.rs b/crates/std_float/src/lib.rs
index 4ac60b10c92ef..1fef17242ca85 100644
--- a/crates/std_float/src/lib.rs
+++ b/crates/std_float/src/lib.rs
@@ -1,5 +1,10 @@
 #![cfg_attr(feature = "as_crate", no_std)] // We are std!
-#![cfg_attr(feature = "as_crate", feature(platform_intrinsics), feature(portable_simd))]
+#![cfg_attr(
+    feature = "as_crate",
+    feature(platform_intrinsics),
+    feature(portable_simd),
+    allow(internal_features)
+)]
 #[cfg(not(feature = "as_crate"))]
 use core::simd;
 #[cfg(feature = "as_crate")]
@@ -144,7 +149,7 @@ where
 #[cfg(test)]
 mod tests {
     use super::*;
-    use simd::*;
+    use simd::prelude::*;
 
     #[test]
     fn everything_works() {
diff --git a/crates/test_helpers/Cargo.toml b/crates/test_helpers/Cargo.toml
index 1d2bc8b519aa6..23dae7c93381e 100644
--- a/crates/test_helpers/Cargo.toml
+++ b/crates/test_helpers/Cargo.toml
@@ -4,10 +4,8 @@ version = "0.1.0"
 edition = "2021"
 publish = false
 
-[dependencies.proptest]
-version = "0.10"
-default-features = false
-features = ["alloc"]
+[dependencies]
+proptest = { version = "0.10", default-features = false, features = ["alloc"] }
 
 [features]
 all_lane_counts = []
diff --git a/crates/test_helpers/src/biteq.rs b/crates/test_helpers/src/biteq.rs
index 7d91260d838a6..cbc20cda0d626 100644
--- a/crates/test_helpers/src/biteq.rs
+++ b/crates/test_helpers/src/biteq.rs
@@ -113,6 +113,27 @@ impl<T: BitEq> core::fmt::Debug for BitEqWrapper<'_, T> {
     }
 }
 
+#[doc(hidden)]
+pub struct BitEqEitherWrapper<'a, T>(pub &'a T, pub &'a T);
+
+impl<T: BitEq> PartialEq<BitEqEitherWrapper<'_, T>> for BitEqWrapper<'_, T> {
+    fn eq(&self, other: &BitEqEitherWrapper<'_, T>) -> bool {
+        self.0.biteq(other.0) || self.0.biteq(other.1)
+    }
+}
+
+impl<T: BitEq> core::fmt::Debug for BitEqEitherWrapper<'_, T> {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        if self.0.biteq(self.1) {
+            self.0.fmt(f)
+        } else {
+            self.0.fmt(f)?;
+            write!(f, " or ")?;
+            self.1.fmt(f)
+        }
+    }
+}
+
 #[macro_export]
 macro_rules! prop_assert_biteq {
     { $a:expr, $b:expr $(,)? } => {
@@ -122,5 +143,14 @@ macro_rules! prop_assert_biteq {
             let b = $b;
             proptest::prop_assert_eq!(BitEqWrapper(&a), BitEqWrapper(&b));
         }
-    }
+    };
+    { $a:expr, $b:expr, $c:expr $(,)? } => {
+        {
+            use $crate::biteq::{BitEqWrapper, BitEqEitherWrapper};
+            let a = $a;
+            let b = $b;
+            let c = $c;
+            proptest::prop_assert_eq!(BitEqWrapper(&a), BitEqEitherWrapper(&b, &c));
+        }
+    };
 }
diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs
index b26cdc311a215..b80c745aaf2fb 100644
--- a/crates/test_helpers/src/lib.rs
+++ b/crates/test_helpers/src/lib.rs
@@ -1,3 +1,5 @@
+#![feature(stdsimd, powerpc_target_feature)]
+
 pub mod array;
 
 #[cfg(target_arch = "wasm32")]
@@ -6,6 +8,9 @@ pub mod wasm;
 #[macro_use]
 pub mod biteq;
 
+pub mod subnormals;
+use subnormals::FlushSubnormals;
+
 /// Specifies the default strategy for testing a type.
 ///
 /// This strategy should be what "makes sense" to test.
@@ -151,7 +156,6 @@ pub fn test_3<
 }
 
 /// Test a unary vector function against a unary scalar function, applied elementwise.
-#[inline(never)]
 pub fn test_unary_elementwise<Scalar, ScalarResult, Vector, VectorResult, const LANES: usize>(
     fv: &dyn Fn(Vector) -> VectorResult,
     fs: &dyn Fn(Scalar) -> ScalarResult,
@@ -177,6 +181,48 @@ pub fn test_unary_elementwise<Scalar, ScalarResult, Vector, VectorResult, const
     });
 }
 
+/// Test a unary vector function against a unary scalar function, applied elementwise.
+///
+/// Passes if the vector result is bit-equal to the scalar result computed with or without subnormal flushing.
+pub fn test_unary_elementwise_flush_subnormals<
+    Scalar,
+    ScalarResult,
+    Vector,
+    VectorResult,
+    const LANES: usize,
+>(
+    fv: &dyn Fn(Vector) -> VectorResult,
+    fs: &dyn Fn(Scalar) -> ScalarResult,
+    check: &dyn Fn([Scalar; LANES]) -> bool,
+) where
+    Scalar: Copy + core::fmt::Debug + DefaultStrategy + FlushSubnormals,
+    ScalarResult: Copy + biteq::BitEq + core::fmt::Debug + DefaultStrategy + FlushSubnormals,
+    Vector: Into<[Scalar; LANES]> + From<[Scalar; LANES]> + Copy,
+    VectorResult: Into<[ScalarResult; LANES]> + From<[ScalarResult; LANES]> + Copy,
+{
+    let flush = |x: Scalar| subnormals::flush(fs(subnormals::flush_in(x)));
+    test_1(&|x: [Scalar; LANES]| {
+        proptest::prop_assume!(check(x));
+        let result_v: [ScalarResult; LANES] = fv(x.into()).into();
+        let result_s: [ScalarResult; LANES] = x
+            .iter()
+            .copied()
+            .map(fs)
+            .collect::<Vec<_>>()
+            .try_into()
+            .unwrap();
+        let result_sf: [ScalarResult; LANES] = x
+            .iter()
+            .copied()
+            .map(flush)
+            .collect::<Vec<_>>()
+            .try_into()
+            .unwrap();
+        crate::prop_assert_biteq!(result_v, result_s, result_sf);
+        Ok(())
+    });
+}
+
 /// Test a unary vector function against a unary scalar function, applied elementwise.
 #[inline(never)]
 pub fn test_unary_mask_elementwise<Scalar, Vector, Mask, const LANES: usize>(
@@ -204,7 +250,6 @@ pub fn test_unary_mask_elementwise<Scalar, Vector, Mask, const LANES: usize>(
 }
 
 /// Test a binary vector function against a binary scalar function, applied elementwise.
-#[inline(never)]
 pub fn test_binary_elementwise<
     Scalar1,
     Scalar2,
@@ -241,6 +286,85 @@ pub fn test_binary_elementwise<
     });
 }
 
+/// Test a binary vector function against a binary scalar function, applied elementwise.
+///
+/// Where subnormals are flushed, use approximate equality.
+pub fn test_binary_elementwise_flush_subnormals<
+    Scalar1,
+    Scalar2,
+    ScalarResult,
+    Vector1,
+    Vector2,
+    VectorResult,
+    const LANES: usize,
+>(
+    fv: &dyn Fn(Vector1, Vector2) -> VectorResult,
+    fs: &dyn Fn(Scalar1, Scalar2) -> ScalarResult,
+    check: &dyn Fn([Scalar1; LANES], [Scalar2; LANES]) -> bool,
+) where
+    Scalar1: Copy + core::fmt::Debug + DefaultStrategy + FlushSubnormals,
+    Scalar2: Copy + core::fmt::Debug + DefaultStrategy + FlushSubnormals,
+    ScalarResult: Copy + biteq::BitEq + core::fmt::Debug + DefaultStrategy + FlushSubnormals,
+    Vector1: Into<[Scalar1; LANES]> + From<[Scalar1; LANES]> + Copy,
+    Vector2: Into<[Scalar2; LANES]> + From<[Scalar2; LANES]> + Copy,
+    VectorResult: Into<[ScalarResult; LANES]> + From<[ScalarResult; LANES]> + Copy,
+{
+    let flush = |x: Scalar1, y: Scalar2| {
+        subnormals::flush(fs(subnormals::flush_in(x), subnormals::flush_in(y)))
+    };
+    test_2(&|x: [Scalar1; LANES], y: [Scalar2; LANES]| {
+        proptest::prop_assume!(check(x, y));
+        let result_v: [ScalarResult; LANES] = fv(x.into(), y.into()).into();
+        let result_s: [ScalarResult; LANES] = x
+            .iter()
+            .copied()
+            .zip(y.iter().copied())
+            .map(|(x, y)| fs(x, y))
+            .collect::<Vec<_>>()
+            .try_into()
+            .unwrap();
+        let result_sf: [ScalarResult; LANES] = x
+            .iter()
+            .copied()
+            .zip(y.iter().copied())
+            .map(|(x, y)| flush(x, y))
+            .collect::<Vec<_>>()
+            .try_into()
+            .unwrap();
+        crate::prop_assert_biteq!(result_v, result_s, result_sf);
+        Ok(())
+    });
+}
+
+/// Test a binary vector function against a binary scalar function, applied elementwise.
+#[inline(never)]
+pub fn test_binary_mask_elementwise<Scalar1, Scalar2, Vector1, Vector2, Mask, const LANES: usize>(
+    fv: &dyn Fn(Vector1, Vector2) -> Mask,
+    fs: &dyn Fn(Scalar1, Scalar2) -> bool,
+    check: &dyn Fn([Scalar1; LANES], [Scalar2; LANES]) -> bool,
+) where
+    Scalar1: Copy + core::fmt::Debug + DefaultStrategy,
+    Scalar2: Copy + core::fmt::Debug + DefaultStrategy,
+    Vector1: Into<[Scalar1; LANES]> + From<[Scalar1; LANES]> + Copy,
+    Vector2: Into<[Scalar2; LANES]> + From<[Scalar2; LANES]> + Copy,
+    Mask: Into<[bool; LANES]> + From<[bool; LANES]> + Copy,
+{
+    test_2(&|x: [Scalar1; LANES], y: [Scalar2; LANES]| {
+        proptest::prop_assume!(check(x, y));
+        let result_v: [bool; LANES] = fv(x.into(), y.into()).into();
+        let result_s: [bool; LANES] = x
+            .iter()
+            .copied()
+            .zip(y.iter().copied())
+            .map(|(x, y)| fs(x, y))
+            .collect::<Vec<_>>()
+            .try_into()
+            .unwrap();
+        crate::prop_assert_biteq!(result_v, result_s);
+        Ok(())
+    });
+}
+
 /// Test a binary vector-scalar function against a binary scalar function, applied elementwise.
 #[inline(never)]
 pub fn test_binary_scalar_rhs_elementwise<
diff --git a/crates/test_helpers/src/subnormals.rs b/crates/test_helpers/src/subnormals.rs
new file mode 100644
index 0000000000000..ec0f1fb24b936
--- /dev/null
+++ b/crates/test_helpers/src/subnormals.rs
@@ -0,0 +1,91 @@
+pub trait FlushSubnormals: Sized {
+    fn flush(self) -> Self {
+        self
+    }
+}
+
+impl<T> FlushSubnormals for *const T {}
+impl<T> FlushSubnormals for *mut T {}
+
+macro_rules! impl_float {
+    { $($ty:ty),* } => {
+        $(
+        impl FlushSubnormals for $ty {
+            fn flush(self) -> Self {
+                let is_f32 = core::mem::size_of::<Self>() == 4;
+                let ppc_flush = is_f32 && cfg!(all(
+                    any(target_arch = "powerpc", all(target_arch = "powerpc64", target_endian = "big")),
+                    target_feature = "altivec",
+                    not(target_feature = "vsx"),
+                ));
+                let arm_flush = is_f32 && cfg!(all(target_arch = "arm", target_feature = "neon"));
+                let flush = ppc_flush || arm_flush;
+                if flush && self.is_subnormal() {
+                    <$ty>::copysign(0., self)
+                } else {
+                    self
+                }
+            }
+        }
+        )*
+    }
+}
+
+macro_rules! impl_else {
+    { $($ty:ty),* } => {
+        $(
+        impl FlushSubnormals for $ty {}
+        )*
+    }
+}
+
+impl_float! { f32, f64 }
+impl_else! { i8, i16, i32, i64, isize, u8, u16, u32, u64, usize }
+
+/// AltiVec should flush subnormal inputs to zero, but QEMU seems to only flush outputs.
+/// https://gitlab.com/qemu-project/qemu/-/issues/1779
+#[cfg(all(
+    any(target_arch = "powerpc", target_arch = "powerpc64"),
+    target_feature = "altivec"
+))]
+fn in_buggy_qemu() -> bool {
+    use std::sync::OnceLock;
+    static BUGGY: OnceLock<bool> = OnceLock::new();
+
+    fn add(x: f32, y: f32) -> f32 {
+        #[cfg(target_arch = "powerpc")]
+        use core::arch::powerpc::*;
+        #[cfg(target_arch = "powerpc64")]
+        use core::arch::powerpc64::*;
+
+        let array: [f32; 4] =
+            unsafe { core::mem::transmute(vec_add(vec_splats(x), vec_splats(y))) };
+        array[0]
+    }
+
+    *BUGGY.get_or_init(|| add(-1.0857398e-38, 0.).is_sign_negative())
+}
+
+#[cfg(all(
+    any(target_arch = "powerpc", target_arch = "powerpc64"),
+    target_feature = "altivec"
+))]
+pub fn flush_in<T: FlushSubnormals>(x: T) -> T {
+    if in_buggy_qemu() {
+        x
+    } else {
+        x.flush()
+    }
+}
+
+#[cfg(not(all(
+    any(target_arch = "powerpc", target_arch = "powerpc64"),
+    target_feature = "altivec"
+)))]
+pub fn flush_in<T: FlushSubnormals>(x: T) -> T {
+    x.flush()
+}
+
+pub fn flush<T: FlushSubnormals>(x: T) -> T {
+    x.flush()
+}

From c6b3b35badd256fa15eea21e707072d8d18e6eed Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sat, 2 Dec 2023 10:49:21 -0500
Subject: [PATCH 05/74] Remove link to core::arch::x86_64

---
 crates/core_simd/src/core_simd_docs.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/src/core_simd_docs.md b/crates/core_simd/src/core_simd_docs.md
index fa93155ff5ed1..bf412e035b513 100644
--- a/crates/core_simd/src/core_simd_docs.md
+++ b/crates/core_simd/src/core_simd_docs.md
@@ -30,7 +30,7 @@ Instead, they map to a reasonable implementation of the operation for the target
 
 Consistency between targets is not compromised to use faster or fewer instructions.
 In some cases, `std::arch` will provide a faster function that has slightly different behavior than the `std::simd` equivalent.
-For example, [`_mm_min_ps`](`core::arch::x86_64::_mm_min_ps`)[^1] can be slightly faster than [`SimdFloat::simd_min`](`num::SimdFloat::simd_min`), but does not conform to the IEEE standard also used by [`f32::min`].
+For example, `_mm_min_ps`[^1] can be slightly faster than [`SimdFloat::simd_min`](`num::SimdFloat::simd_min`), but does not conform to the IEEE standard also used by [`f32::min`].
 When necessary, [`Simd<T, N>`] can be converted to the types provided by `std::arch` to make use of target-specific functions.
 
 Many targets simply don't have SIMD, or don't support SIMD for a particular element type.

From 08069a52d18185f95df12c5bd50513f25e0929f0 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Mon, 4 Dec 2023 08:16:03 +0100
Subject: [PATCH 06/74] portable-simd: add missing feature gate

---
 crates/core_simd/tests/pointers.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/tests/pointers.rs b/crates/core_simd/tests/pointers.rs
index a90ff928cedcb..b9f32d16e01d1 100644
--- a/crates/core_simd/tests/pointers.rs
+++ b/crates/core_simd/tests/pointers.rs
@@ -1,4 +1,4 @@
-#![feature(portable_simd, strict_provenance)]
+#![feature(portable_simd, strict_provenance, exposed_provenance)]
 
 use core_simd::simd::{
     ptr::{SimdConstPtr, SimdMutPtr},

From 98fc771e5b4d568de6a91985553e7b8b85a580c3 Mon Sep 17 00:00:00 2001
From: Urgau <urgau@numericable.fr>
Date: Tue, 12 Dec 2023 23:26:45 +0100
Subject: [PATCH 07/74] Fix target_feature config in portable-simd

---
 crates/core_simd/src/swizzle_dyn.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs
index bd8a38e350d3b..dac013cc98dc7 100644
--- a/crates/core_simd/src/swizzle_dyn.rs
+++ b/crates/core_simd/src/swizzle_dyn.rs
@@ -55,7 +55,7 @@ where
                 16 => transize(vqtbl1q_u8, self, idxs),
                 #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                 32 => transize_raw(avx2_pshufb, self, idxs),
-                #[cfg(target_feature = "avx512vl,avx512vbmi")]
+                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
                 32 => transize(x86::_mm256_permutexvar_epi8, self, idxs),
                 // Notable absence: avx512bw shuffle
                 // If avx512bw is available, odds of avx512vbmi are good

From 7e65a05aa03a7393c52d8bea5014d269f6d147e1 Mon Sep 17 00:00:00 2001
From: Amanieu d'Antras <amanieu@gmail.com>
Date: Tue, 30 Jan 2024 03:40:53 +0000
Subject: [PATCH 08/74] Disable conversions between portable_simd and stdarch
 on big-endian ARM

stdarch no longer provides SIMD on big-endian ARM due to
https://github.com/rust-lang/stdarch/issues/1484
---
 crates/core_simd/src/vendor/arm.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/crates/core_simd/src/vendor/arm.rs b/crates/core_simd/src/vendor/arm.rs
index ff3b69ccf9592..ee5c642137367 100644
--- a/crates/core_simd/src/vendor/arm.rs
+++ b/crates/core_simd/src/vendor/arm.rs
@@ -7,9 +7,12 @@ use core::arch::arm::*;
 #[cfg(target_arch = "aarch64")]
 use core::arch::aarch64::*;
 
-#[cfg(any(
-    target_arch = "aarch64",
-    all(target_arch = "arm", target_feature = "v7"),
+#[cfg(all(
+    any(
+        target_arch = "aarch64",
+        all(target_arch = "arm", target_feature = "v7"),
+    ),
+    target_endian = "little"
 ))]
 mod neon {
     use super::*;

From f6996612ce2db070ae2a9370c71ac725d08b03e3 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sun, 18 Feb 2024 10:14:03 -0500
Subject: [PATCH 09/74] Merge commit '649110751ef4f27440d7cc711b3e07d11bf02d4a'
 into sync-portable-simd-2024-02-18

---
 crates/core_simd/src/intrinsics.rs         | 169 ---------------------
 crates/core_simd/src/lib.rs                |  22 ++-
 crates/core_simd/src/masks.rs              |  41 +++--
 crates/core_simd/src/masks/bitmask.rs      |   9 +-
 crates/core_simd/src/masks/full_masks.rs   |  27 ++--
 crates/core_simd/src/mod.rs                |   4 -
 crates/core_simd/src/ops.rs                |   6 +-
 crates/core_simd/src/ops/unary.rs          |   3 +-
 crates/core_simd/src/select.rs             |   3 +-
 crates/core_simd/src/simd/cmp/eq.rs        |   9 +-
 crates/core_simd/src/simd/cmp/ord.rs       |  25 ++-
 crates/core_simd/src/simd/num/float.rs     |  20 +--
 crates/core_simd/src/simd/num/int.rs       |  26 ++--
 crates/core_simd/src/simd/num/uint.rs      |  30 ++--
 crates/core_simd/src/simd/ptr/const_ptr.rs |  14 +-
 crates/core_simd/src/simd/ptr/mut_ptr.rs   |  14 +-
 crates/core_simd/src/swizzle.rs            |   5 +-
 crates/core_simd/src/swizzle_dyn.rs        |  44 ++----
 crates/core_simd/src/vector.rs             |  15 +-
 crates/core_simd/tests/masks.rs            |  42 +++++
 crates/std_float/src/lib.rs                |  38 ++---
 crates/test_helpers/src/lib.rs             |   6 +-
 22 files changed, 218 insertions(+), 354 deletions(-)
 delete mode 100644 crates/core_simd/src/intrinsics.rs

diff --git a/crates/core_simd/src/intrinsics.rs b/crates/core_simd/src/intrinsics.rs
deleted file mode 100644
index b27893bc7294d..0000000000000
--- a/crates/core_simd/src/intrinsics.rs
+++ /dev/null
@@ -1,169 +0,0 @@
-//! This module contains the LLVM intrinsics bindings that provide the functionality for this
-//! crate.
-//!
-//! The LLVM assembly language is documented here: <https://llvm.org/docs/LangRef.html>
-//!
-//! A quick glossary of jargon that may appear in this module, mostly paraphrasing LLVM's LangRef:
-//! - poison: "undefined behavior as a value". specifically, it is like uninit memory (such as padding bytes). it is "safe" to create poison, BUT
-//!   poison MUST NOT be observed from safe code, as operations on poison return poison, like NaN. unlike NaN, which has defined comparisons,
-//!   poison is neither true nor false, and LLVM may also convert it to undef (at which point it is both). so, it can't be conditioned on, either.
-//! - undef: "a value that is every value". functionally like poison, insofar as Rust is concerned. poison may become this. note:
-//!   this means that division by poison or undef is like division by zero, which means it inflicts...
-//! - "UB": poison and undef cover most of what people call "UB". "UB" means this operation immediately invalidates the program:
-//!   LLVM is allowed to lower it to `ud2` or other opcodes that may cause an illegal instruction exception, and this is the "good end".
-//!   The "bad end" is that LLVM may reverse time to the moment control flow diverged on a path towards undefined behavior,
-//!   and destroy the other branch, potentially deleting safe code and violating Rust's `unsafe` contract.
-//!
-//! Note that according to LLVM, vectors are not arrays, but they are equivalent when stored to and loaded from memory.
-//!
-//! Unless stated otherwise, all intrinsics for binary operations require SIMD vectors of equal types and lengths.
-
-// These intrinsics aren't linked directly from LLVM and are mostly undocumented, however they are
-// mostly lowered to the matching LLVM instructions by the compiler in a fairly straightforward manner.
-// The associated LLVM instruction or intrinsic is documented alongside each Rust intrinsic function.
-extern "platform-intrinsic" {
-    /// add/fadd
-    pub(crate) fn simd_add<T>(x: T, y: T) -> T;
-
-    /// sub/fsub
-    pub(crate) fn simd_sub<T>(lhs: T, rhs: T) -> T;
-
-    /// mul/fmul
-    pub(crate) fn simd_mul<T>(x: T, y: T) -> T;
-
-    /// udiv/sdiv/fdiv
-    /// ints and uints: {s,u}div incur UB if division by zero occurs.
-    /// ints: sdiv is UB for int::MIN / -1.
-    /// floats: fdiv is never UB, but may create NaNs or infinities.
-    pub(crate) fn simd_div<T>(lhs: T, rhs: T) -> T;
-
-    /// urem/srem/frem
-    /// ints and uints: {s,u}rem incur UB if division by zero occurs.
-    /// ints: srem is UB for int::MIN / -1.
-    /// floats: frem is equivalent to libm::fmod in the "default" floating point environment, sans errno.
-    pub(crate) fn simd_rem<T>(lhs: T, rhs: T) -> T;
-
-    /// shl
-    /// for (u)ints. poison if rhs >= lhs::BITS
-    pub(crate) fn simd_shl<T>(lhs: T, rhs: T) -> T;
-
-    /// ints: ashr
-    /// uints: lshr
-    /// poison if rhs >= lhs::BITS
-    pub(crate) fn simd_shr<T>(lhs: T, rhs: T) -> T;
-
-    /// and
-    pub(crate) fn simd_and<T>(x: T, y: T) -> T;
-
-    /// or
-    pub(crate) fn simd_or<T>(x: T, y: T) -> T;
-
-    /// xor
-    pub(crate) fn simd_xor<T>(x: T, y: T) -> T;
-
-    /// fptoui/fptosi/uitofp/sitofp
-    /// casting floats to integers is truncating, so it is safe to convert values like e.g. 1.5
-    /// but the truncated value must fit in the target type or the result is poison.
-    /// use `simd_as` instead for a cast that performs a saturating conversion.
-    pub(crate) fn simd_cast<T, U>(x: T) -> U;
-    /// follows Rust's `T as U` semantics, including saturating float casts
-    /// which amounts to the same as `simd_cast` for many cases
-    pub(crate) fn simd_as<T, U>(x: T) -> U;
-
-    /// neg/fneg
-    /// ints: ultimately becomes a call to cg_ssa's BuilderMethods::neg. cg_llvm equates this to `simd_sub(Simd::splat(0), x)`.
-    /// floats: LLVM's fneg, which changes the floating point sign bit. Some arches have instructions for it.
-    /// Rust panics for Neg::neg(int::MIN) due to overflow, but it is not UB in LLVM without `nsw`.
-    pub(crate) fn simd_neg<T>(x: T) -> T;
-
-    /// fabs
-    pub(crate) fn simd_fabs<T>(x: T) -> T;
-
-    // minnum/maxnum
-    pub(crate) fn simd_fmin<T>(x: T, y: T) -> T;
-    pub(crate) fn simd_fmax<T>(x: T, y: T) -> T;
-
-    // these return Simd<int, N> with the same BITS size as the inputs
-    pub(crate) fn simd_eq<T, U>(x: T, y: T) -> U;
-    pub(crate) fn simd_ne<T, U>(x: T, y: T) -> U;
-    pub(crate) fn simd_lt<T, U>(x: T, y: T) -> U;
-    pub(crate) fn simd_le<T, U>(x: T, y: T) -> U;
-    pub(crate) fn simd_gt<T, U>(x: T, y: T) -> U;
-    pub(crate) fn simd_ge<T, U>(x: T, y: T) -> U;
-
-    // shufflevector
-    // idx: LLVM calls it a "shuffle mask vector constant", a vector of i32s
-    pub(crate) fn simd_shuffle<T, U, V>(x: T, y: T, idx: U) -> V;
-
-    /// llvm.masked.gather
-    /// like a loop of pointer reads
-    /// val: vector of values to select if a lane is masked
-    /// ptr: vector of pointers to read from
-    /// mask: a "wide" mask of integers, selects as if simd_select(mask, read(ptr), val)
-    /// note, the LLVM intrinsic accepts a mask vector of `<N x i1>`
-    /// FIXME: review this if/when we fix up our mask story in general?
-    pub(crate) fn simd_gather<T, U, V>(val: T, ptr: U, mask: V) -> T;
-    /// llvm.masked.scatter
-    /// like gather, but more spicy, as it writes instead of reads
-    pub(crate) fn simd_scatter<T, U, V>(val: T, ptr: U, mask: V);
-
-    // {s,u}add.sat
-    pub(crate) fn simd_saturating_add<T>(x: T, y: T) -> T;
-
-    // {s,u}sub.sat
-    pub(crate) fn simd_saturating_sub<T>(lhs: T, rhs: T) -> T;
-
-    // reductions
-    // llvm.vector.reduce.{add,fadd}
-    pub(crate) fn simd_reduce_add_ordered<T, U>(x: T, y: U) -> U;
-    // llvm.vector.reduce.{mul,fmul}
-    pub(crate) fn simd_reduce_mul_ordered<T, U>(x: T, y: U) -> U;
-    #[allow(unused)]
-    pub(crate) fn simd_reduce_all<T>(x: T) -> bool;
-    #[allow(unused)]
-    pub(crate) fn simd_reduce_any<T>(x: T) -> bool;
-    pub(crate) fn simd_reduce_max<T, U>(x: T) -> U;
-    pub(crate) fn simd_reduce_min<T, U>(x: T) -> U;
-    pub(crate) fn simd_reduce_and<T, U>(x: T) -> U;
-    pub(crate) fn simd_reduce_or<T, U>(x: T) -> U;
-    pub(crate) fn simd_reduce_xor<T, U>(x: T) -> U;
-
-    // truncate integer vector to bitmask
-    // `fn simd_bitmask(vector) -> unsigned integer` takes a vector of integers and
-    // returns either an unsigned integer or array of `u8`.
-    // Every element in the vector becomes a single bit in the returned bitmask.
-    // If the vector has less than 8 lanes, a u8 is returned with zeroed trailing bits.
-    // The bit order of the result depends on the byte endianness. LSB-first for little
-    // endian and MSB-first for big endian.
-    //
-    // UB if called on a vector with values other than 0 and -1.
-    #[allow(unused)]
-    pub(crate) fn simd_bitmask<T, U>(x: T) -> U;
-
-    // select
-    // first argument is a vector of integers, -1 (all bits 1) is "true"
-    // logically equivalent to (yes & m) | (no & (m^-1),
-    // but you can use it on floats.
-    pub(crate) fn simd_select<M, T>(m: M, yes: T, no: T) -> T;
-    #[allow(unused)]
-    pub(crate) fn simd_select_bitmask<M, T>(m: M, yes: T, no: T) -> T;
-
-    /// getelementptr (without inbounds)
-    /// equivalent to wrapping_offset
-    pub(crate) fn simd_arith_offset<T, U>(ptr: T, offset: U) -> T;
-
-    /// equivalent to `T as U` semantics, specifically for pointers
-    pub(crate) fn simd_cast_ptr<T, U>(ptr: T) -> U;
-
-    /// expose a pointer as an address
-    pub(crate) fn simd_expose_addr<T, U>(ptr: T) -> U;
-
-    /// convert an exposed address back to a pointer
-    pub(crate) fn simd_from_exposed_addr<T, U>(addr: T) -> U;
-
-    // Integer operations
-    pub(crate) fn simd_bswap<T>(x: T) -> T;
-    pub(crate) fn simd_bitreverse<T>(x: T) -> T;
-    pub(crate) fn simd_ctlz<T>(x: T) -> T;
-    pub(crate) fn simd_cttz<T>(x: T) -> T;
-}
diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 64ba9705ef523..a25723e11cef9 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -1,20 +1,38 @@
 #![no_std]
 #![feature(
+    const_intrinsic_copy,
     const_refs_to_cell,
     const_maybe_uninit_as_mut_ptr,
     const_mut_refs,
     convert_float_to_int,
+    core_intrinsics,
     decl_macro,
     inline_const,
     intra_doc_pointers,
-    platform_intrinsics,
     repr_simd,
     simd_ffi,
     staged_api,
-    stdsimd,
     strict_provenance,
     ptr_metadata
 )]
+#![cfg_attr(
+    all(
+        any(target_arch = "aarch64", target_arch = "arm",),
+        any(
+            all(target_feature = "v6", not(target_feature = "mclass")),
+            all(target_feature = "mclass", target_feature = "dsp"),
+        )
+    ),
+    feature(stdarch_arm_dsp)
+)]
+#![cfg_attr(
+    all(target_arch = "arm", target_feature = "v7"),
+    feature(stdarch_arm_neon_intrinsics)
+)]
+#![cfg_attr(
+    any(target_arch = "powerpc", target_arch = "powerpc64"),
+    feature(stdarch_powerpc)
+)]
 #![warn(missing_docs, clippy::missing_inline_in_public_items)] // basically all items, really
 #![deny(unsafe_op_in_unsafe_fn, clippy::undocumented_unsafe_blocks)]
 #![allow(internal_features)]
diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index 0623d2bf3d121..e480c25a51e6f 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -12,9 +12,7 @@
 )]
 mod mask_impl;
 
-use crate::simd::{
-    cmp::SimdPartialEq, intrinsics, LaneCount, Simd, SimdCast, SimdElement, SupportedLaneCount,
-};
+use crate::simd::{LaneCount, Simd, SimdCast, SimdElement, SupportedLaneCount};
 use core::cmp::Ordering;
 use core::{fmt, mem};
 
@@ -35,7 +33,7 @@ mod sealed {
 
         fn eq(self, other: Self) -> bool;
 
-        fn as_usize(self) -> usize;
+        fn to_usize(self) -> usize;
 
         type Unsigned: SimdElement;
 
@@ -60,14 +58,23 @@ macro_rules! impl_element {
             where
                 LaneCount<N>: SupportedLaneCount,
             {
-                (value.simd_eq(Simd::splat(0 as _)) | value.simd_eq(Simd::splat(-1 as _))).all()
+                // We can't use `Simd` directly, because `Simd`'s functions call this function and
+                // we will end up with an infinite loop.
+                // Safety: `value` is an integer vector
+                unsafe {
+                    use core::intrinsics::simd;
+                    let falses: Simd<Self, N> = simd::simd_eq(value, Simd::splat(0 as _));
+                    let trues: Simd<Self, N> = simd::simd_eq(value, Simd::splat(-1 as _));
+                    let valid: Simd<Self, N> = simd::simd_or(falses, trues);
+                    simd::simd_reduce_all(valid)
+                }
             }
 
             #[inline]
             fn eq(self, other: Self) -> bool { self == other }
 
             #[inline]
-            fn as_usize(self) -> usize {
+            fn to_usize(self) -> usize {
                 self as usize
             }
 
@@ -141,8 +148,9 @@ where
         // but these are "dependently-sized" types, so copy elision it is!
         unsafe {
             let bytes: [u8; N] = mem::transmute_copy(&array);
-            let bools: Simd<i8, N> = intrinsics::simd_ne(Simd::from_array(bytes), Simd::splat(0u8));
-            Mask::from_int_unchecked(intrinsics::simd_cast(bools))
+            let bools: Simd<i8, N> =
+                core::intrinsics::simd::simd_ne(Simd::from_array(bytes), Simd::splat(0u8));
+            Mask::from_int_unchecked(core::intrinsics::simd::simd_cast(bools))
         }
     }
 
@@ -160,7 +168,7 @@ where
         // This would be hypothetically valid as an "in-place" transmute,
         // but these are "dependently-sized" types, so copy elision it is!
         unsafe {
-            let mut bytes: Simd<i8, N> = intrinsics::simd_cast(self.to_int());
+            let mut bytes: Simd<i8, N> = core::intrinsics::simd::simd_cast(self.to_int());
             bytes &= Simd::splat(1i8);
             mem::transmute_copy(&bytes)
         }
@@ -175,7 +183,10 @@ where
     #[must_use = "method returns a new mask and does not mutate the original value"]
     pub unsafe fn from_int_unchecked(value: Simd<T, N>) -> Self {
         // Safety: the caller must confirm this invariant
-        unsafe { Self(mask_impl::Mask::from_int_unchecked(value)) }
+        unsafe {
+            core::intrinsics::assume(<T as Sealed>::valid(value));
+            Self(mask_impl::Mask::from_int_unchecked(value))
+        }
     }
 
     /// Converts a vector of integers to a mask, where 0 represents `false` and -1
@@ -374,15 +385,17 @@ where
         );
 
         // Safety: the input and output are integer vectors
-        let index: Simd<T, N> = unsafe { intrinsics::simd_cast(index) };
+        let index: Simd<T, N> = unsafe { core::intrinsics::simd::simd_cast(index) };
 
         let masked_index = self.select(index, Self::splat(true).to_int());
 
         // Safety: the input and output are integer vectors
-        let masked_index: Simd<T::Unsigned, N> = unsafe { intrinsics::simd_cast(masked_index) };
+        let masked_index: Simd<T::Unsigned, N> =
+            unsafe { core::intrinsics::simd::simd_cast(masked_index) };
 
         // Safety: the input is an integer vector
-        let min_index: T::Unsigned = unsafe { intrinsics::simd_reduce_min(masked_index) };
+        let min_index: T::Unsigned =
+            unsafe { core::intrinsics::simd::simd_reduce_min(masked_index) };
 
         // Safety: the return value is the unsigned version of T
         let min_index: T = unsafe { core::mem::transmute_copy(&min_index) };
@@ -390,7 +403,7 @@ where
         if min_index.eq(T::TRUE) {
             None
         } else {
-            Some(min_index.as_usize())
+            Some(min_index.to_usize())
         }
     }
 }
diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs
index 6ddff07fea254..96c553426ee74 100644
--- a/crates/core_simd/src/masks/bitmask.rs
+++ b/crates/core_simd/src/masks/bitmask.rs
@@ -1,6 +1,5 @@
 #![allow(unused_imports)]
 use super::MaskElement;
-use crate::simd::intrinsics;
 use crate::simd::{LaneCount, Simd, SupportedLaneCount};
 use core::marker::PhantomData;
 
@@ -109,14 +108,18 @@ where
     #[must_use = "method returns a new vector and does not mutate the original value"]
     pub fn to_int(self) -> Simd<T, N> {
         unsafe {
-            intrinsics::simd_select_bitmask(self.0, Simd::splat(T::TRUE), Simd::splat(T::FALSE))
+            core::intrinsics::simd::simd_select_bitmask(
+                self.0,
+                Simd::splat(T::TRUE),
+                Simd::splat(T::FALSE),
+            )
         }
     }
 
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original value"]
     pub unsafe fn from_int_unchecked(value: Simd<T, N>) -> Self {
-        unsafe { Self(intrinsics::simd_bitmask(value), PhantomData) }
+        unsafe { Self(core::intrinsics::simd::simd_bitmask(value), PhantomData) }
     }
 
     #[inline]
diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs
index 63964f455e05c..87f031a9f367a 100644
--- a/crates/core_simd/src/masks/full_masks.rs
+++ b/crates/core_simd/src/masks/full_masks.rs
@@ -1,6 +1,5 @@
 //! Masks that take up full SIMD vector registers.
 
-use crate::simd::intrinsics;
 use crate::simd::{LaneCount, MaskElement, Simd, SupportedLaneCount};
 
 #[repr(transparent)]
@@ -138,7 +137,7 @@ where
         U: MaskElement,
     {
         // Safety: masks are simply integer vectors of 0 and -1, and we can cast the element type.
-        unsafe { Mask(intrinsics::simd_cast(self.0)) }
+        unsafe { Mask(core::intrinsics::simd::simd_cast(self.0)) }
     }
 
     #[inline]
@@ -150,13 +149,16 @@ where
         unsafe {
             // Compute the bitmask
             let mut bytes: <LaneCount<N> as SupportedLaneCount>::BitMask =
-                intrinsics::simd_bitmask(self.0);
+                core::intrinsics::simd::simd_bitmask(self.0);
 
             // LLVM assumes bit order should match endianness
             if cfg!(target_endian = "big") {
                 for x in bytes.as_mut() {
                     *x = x.reverse_bits()
                 }
+                if N % 8 > 0 {
+                    bytes.as_mut()[N / 8] >>= 8 - N % 8;
+                }
             }
 
             bitmask.as_mut_array()[..bytes.as_ref().len()].copy_from_slice(bytes.as_ref());
@@ -180,10 +182,13 @@ where
                 for x in bytes.as_mut() {
                     *x = x.reverse_bits();
                 }
+                if N % 8 > 0 {
+                    bytes.as_mut()[N / 8] >>= 8 - N % 8;
+                }
             }
 
             // Compute the regular mask
-            Self::from_int_unchecked(intrinsics::simd_select_bitmask(
+            Self::from_int_unchecked(core::intrinsics::simd::simd_select_bitmask(
                 bytes,
                 Self::splat(true).to_int(),
                 Self::splat(false).to_int(),
@@ -199,7 +204,7 @@ where
         let resized = self.to_int().resize::<M>(T::FALSE);
 
         // Safety: `resized` is an integer vector with length M, which must match T
-        let bitmask: U = unsafe { intrinsics::simd_bitmask(resized) };
+        let bitmask: U = unsafe { core::intrinsics::simd::simd_bitmask(resized) };
 
         // LLVM assumes bit order should match endianness
         if cfg!(target_endian = "big") {
@@ -223,7 +228,7 @@ where
 
         // SAFETY: `mask` is the correct bitmask type for a u64 bitmask
         let mask: Simd<T, M> = unsafe {
-            intrinsics::simd_select_bitmask(
+            core::intrinsics::simd::simd_select_bitmask(
                 bitmask,
                 Simd::<T, M>::splat(T::TRUE),
                 Simd::<T, M>::splat(T::FALSE),
@@ -274,14 +279,14 @@ where
     #[must_use = "method returns a new bool and does not mutate the original value"]
     pub fn any(self) -> bool {
         // Safety: use `self` as an integer vector
-        unsafe { intrinsics::simd_reduce_any(self.to_int()) }
+        unsafe { core::intrinsics::simd::simd_reduce_any(self.to_int()) }
     }
 
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original value"]
     pub fn all(self) -> bool {
         // Safety: use `self` as an integer vector
-        unsafe { intrinsics::simd_reduce_all(self.to_int()) }
+        unsafe { core::intrinsics::simd::simd_reduce_all(self.to_int()) }
     }
 }
 
@@ -306,7 +311,7 @@ where
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitand(self, rhs: Self) -> Self {
         // Safety: `self` is an integer vector
-        unsafe { Self(intrinsics::simd_and(self.0, rhs.0)) }
+        unsafe { Self(core::intrinsics::simd::simd_and(self.0, rhs.0)) }
     }
 }
 
@@ -320,7 +325,7 @@ where
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitor(self, rhs: Self) -> Self {
         // Safety: `self` is an integer vector
-        unsafe { Self(intrinsics::simd_or(self.0, rhs.0)) }
+        unsafe { Self(core::intrinsics::simd::simd_or(self.0, rhs.0)) }
     }
 }
 
@@ -334,7 +339,7 @@ where
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn bitxor(self, rhs: Self) -> Self {
         // Safety: `self` is an integer vector
-        unsafe { Self(intrinsics::simd_xor(self.0, rhs.0)) }
+        unsafe { Self(core::intrinsics::simd::simd_xor(self.0, rhs.0)) }
     }
 }
 
diff --git a/crates/core_simd/src/mod.rs b/crates/core_simd/src/mod.rs
index fd016f1c6f7ab..45b1a0f975141 100644
--- a/crates/core_simd/src/mod.rs
+++ b/crates/core_simd/src/mod.rs
@@ -1,8 +1,6 @@
 #[macro_use]
 mod swizzle;
 
-pub(crate) mod intrinsics;
-
 mod alias;
 mod cast;
 mod fmt;
@@ -27,8 +25,6 @@ pub mod simd {
 
     pub mod cmp;
 
-    pub(crate) use crate::core_simd::intrinsics;
-
     pub use crate::core_simd::alias::*;
     pub use crate::core_simd::cast::*;
     pub use crate::core_simd::lane_count::{LaneCount, SupportedLaneCount};
diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index 8a1b083f03982..d8e10eeaa1a2a 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -37,7 +37,7 @@ where
 macro_rules! unsafe_base {
     ($lhs:ident, $rhs:ident, {$simd_call:ident}, $($_:tt)*) => {
         // Safety: $lhs and $rhs are vectors
-        unsafe { $crate::simd::intrinsics::$simd_call($lhs, $rhs) }
+        unsafe { core::intrinsics::simd::$simd_call($lhs, $rhs) }
     };
 }
 
@@ -55,7 +55,7 @@ macro_rules! wrap_bitshift {
         #[allow(clippy::suspicious_arithmetic_impl)]
         // Safety: $lhs and the bitand result are vectors
         unsafe {
-            $crate::simd::intrinsics::$simd_call(
+            core::intrinsics::simd::$simd_call(
                 $lhs,
                 $rhs.bitand(Simd::splat(<$int>::BITS as $int - 1)),
             )
@@ -97,7 +97,7 @@ macro_rules! int_divrem_guard {
                 $rhs
             };
             // Safety: $lhs and rhs are vectors
-            unsafe { $crate::simd::intrinsics::$simd_call($lhs, rhs) }
+            unsafe { core::intrinsics::simd::$simd_call($lhs, rhs) }
         }
     };
 }
diff --git a/crates/core_simd/src/ops/unary.rs b/crates/core_simd/src/ops/unary.rs
index a651aa73e9524..bdae96332a3ae 100644
--- a/crates/core_simd/src/ops/unary.rs
+++ b/crates/core_simd/src/ops/unary.rs
@@ -1,4 +1,3 @@
-use crate::simd::intrinsics;
 use crate::simd::{LaneCount, Simd, SimdElement, SupportedLaneCount};
 use core::ops::{Neg, Not}; // unary ops
 
@@ -15,7 +14,7 @@ macro_rules! neg {
             #[must_use = "operator returns a new vector without mutating the input"]
             fn neg(self) -> Self::Output {
                 // Safety: `self` is a signed vector
-                unsafe { intrinsics::simd_neg(self) }
+                unsafe { core::intrinsics::simd::simd_neg(self) }
             }
         })*
     }
diff --git a/crates/core_simd/src/select.rs b/crates/core_simd/src/select.rs
index cdcf8eeec815a..f33aa261a928f 100644
--- a/crates/core_simd/src/select.rs
+++ b/crates/core_simd/src/select.rs
@@ -1,4 +1,3 @@
-use crate::simd::intrinsics;
 use crate::simd::{LaneCount, Mask, MaskElement, Simd, SimdElement, SupportedLaneCount};
 
 impl<T, const N: usize> Mask<T, N>
@@ -29,7 +28,7 @@ where
     {
         // Safety: The mask has been cast to a vector of integers,
         // and the operands to select between are vectors of the same type and length.
-        unsafe { intrinsics::simd_select(self.to_int(), true_values, false_values) }
+        unsafe { core::intrinsics::simd::simd_select(self.to_int(), true_values, false_values) }
     }
 
     /// Choose elements from two masks.
diff --git a/crates/core_simd/src/simd/cmp/eq.rs b/crates/core_simd/src/simd/cmp/eq.rs
index f132fa2cc0cae..5b4615ce51d79 100644
--- a/crates/core_simd/src/simd/cmp/eq.rs
+++ b/crates/core_simd/src/simd/cmp/eq.rs
@@ -1,5 +1,4 @@
 use crate::simd::{
-    intrinsics,
     ptr::{SimdConstPtr, SimdMutPtr},
     LaneCount, Mask, Simd, SimdElement, SupportedLaneCount,
 };
@@ -31,14 +30,14 @@ macro_rules! impl_number {
             fn simd_eq(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Mask::from_int_unchecked(intrinsics::simd_eq(self, other)) }
+                unsafe { Mask::from_int_unchecked(core::intrinsics::simd::simd_eq(self, other)) }
             }
 
             #[inline]
             fn simd_ne(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Mask::from_int_unchecked(intrinsics::simd_ne(self, other)) }
+                unsafe { Mask::from_int_unchecked(core::intrinsics::simd::simd_ne(self, other)) }
             }
         }
         )*
@@ -60,14 +59,14 @@ macro_rules! impl_mask {
             fn simd_eq(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Self::from_int_unchecked(intrinsics::simd_eq(self.to_int(), other.to_int())) }
+                unsafe { Self::from_int_unchecked(core::intrinsics::simd::simd_eq(self.to_int(), other.to_int())) }
             }
 
             #[inline]
             fn simd_ne(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Self::from_int_unchecked(intrinsics::simd_ne(self.to_int(), other.to_int())) }
+                unsafe { Self::from_int_unchecked(core::intrinsics::simd::simd_ne(self.to_int(), other.to_int())) }
             }
         }
         )*
diff --git a/crates/core_simd/src/simd/cmp/ord.rs b/crates/core_simd/src/simd/cmp/ord.rs
index 4e9d49ea2211a..899f00a831641 100644
--- a/crates/core_simd/src/simd/cmp/ord.rs
+++ b/crates/core_simd/src/simd/cmp/ord.rs
@@ -1,6 +1,5 @@
 use crate::simd::{
     cmp::SimdPartialEq,
-    intrinsics,
     ptr::{SimdConstPtr, SimdMutPtr},
     LaneCount, Mask, Simd, SupportedLaneCount,
 };
@@ -57,28 +56,28 @@ macro_rules! impl_integer {
             fn simd_lt(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Mask::from_int_unchecked(intrinsics::simd_lt(self, other)) }
+                unsafe { Mask::from_int_unchecked(core::intrinsics::simd::simd_lt(self, other)) }
             }
 
             #[inline]
             fn simd_le(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Mask::from_int_unchecked(intrinsics::simd_le(self, other)) }
+                unsafe { Mask::from_int_unchecked(core::intrinsics::simd::simd_le(self, other)) }
             }
 
             #[inline]
             fn simd_gt(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Mask::from_int_unchecked(intrinsics::simd_gt(self, other)) }
+                unsafe { Mask::from_int_unchecked(core::intrinsics::simd::simd_gt(self, other)) }
             }
 
             #[inline]
             fn simd_ge(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Mask::from_int_unchecked(intrinsics::simd_ge(self, other)) }
+                unsafe { Mask::from_int_unchecked(core::intrinsics::simd::simd_ge(self, other)) }
             }
         }
 
@@ -123,28 +122,28 @@ macro_rules! impl_float {
             fn simd_lt(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Mask::from_int_unchecked(intrinsics::simd_lt(self, other)) }
+                unsafe { Mask::from_int_unchecked(core::intrinsics::simd::simd_lt(self, other)) }
             }
 
             #[inline]
             fn simd_le(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Mask::from_int_unchecked(intrinsics::simd_le(self, other)) }
+                unsafe { Mask::from_int_unchecked(core::intrinsics::simd::simd_le(self, other)) }
             }
 
             #[inline]
             fn simd_gt(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Mask::from_int_unchecked(intrinsics::simd_gt(self, other)) }
+                unsafe { Mask::from_int_unchecked(core::intrinsics::simd::simd_gt(self, other)) }
             }
 
             #[inline]
             fn simd_ge(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Mask::from_int_unchecked(intrinsics::simd_ge(self, other)) }
+                unsafe { Mask::from_int_unchecked(core::intrinsics::simd::simd_ge(self, other)) }
             }
         }
         )*
@@ -164,28 +163,28 @@ macro_rules! impl_mask {
             fn simd_lt(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Self::from_int_unchecked(intrinsics::simd_lt(self.to_int(), other.to_int())) }
+                unsafe { Self::from_int_unchecked(core::intrinsics::simd::simd_lt(self.to_int(), other.to_int())) }
             }
 
             #[inline]
             fn simd_le(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Self::from_int_unchecked(intrinsics::simd_le(self.to_int(), other.to_int())) }
+                unsafe { Self::from_int_unchecked(core::intrinsics::simd::simd_le(self.to_int(), other.to_int())) }
             }
 
             #[inline]
             fn simd_gt(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Self::from_int_unchecked(intrinsics::simd_gt(self.to_int(), other.to_int())) }
+                unsafe { Self::from_int_unchecked(core::intrinsics::simd::simd_gt(self.to_int(), other.to_int())) }
             }
 
             #[inline]
             fn simd_ge(self, other: Self) -> Self::Mask {
                 // Safety: `self` is a vector, and the result of the comparison
                 // is always a valid mask.
-                unsafe { Self::from_int_unchecked(intrinsics::simd_ge(self.to_int(), other.to_int())) }
+                unsafe { Self::from_int_unchecked(core::intrinsics::simd::simd_ge(self.to_int(), other.to_int())) }
             }
         }
 
diff --git a/crates/core_simd/src/simd/num/float.rs b/crates/core_simd/src/simd/num/float.rs
index fc0b99e87a68e..59e43851ea8da 100644
--- a/crates/core_simd/src/simd/num/float.rs
+++ b/crates/core_simd/src/simd/num/float.rs
@@ -1,7 +1,7 @@
 use super::sealed::Sealed;
 use crate::simd::{
     cmp::{SimdPartialEq, SimdPartialOrd},
-    intrinsics, LaneCount, Mask, Simd, SimdCast, SimdElement, SupportedLaneCount,
+    LaneCount, Mask, Simd, SimdCast, SimdElement, SupportedLaneCount,
 };
 
 /// Operations on SIMD vectors of floats.
@@ -259,7 +259,7 @@ macro_rules! impl_trait {
             fn cast<T: SimdCast>(self) -> Self::Cast<T>
             {
                 // Safety: supported types are guaranteed by SimdCast
-                unsafe { intrinsics::simd_as(self) }
+                unsafe { core::intrinsics::simd::simd_as(self) }
             }
 
             #[inline]
@@ -269,7 +269,7 @@ macro_rules! impl_trait {
                 Self::Scalar: core::convert::FloatToInt<I>,
             {
                 // Safety: supported types are guaranteed by SimdCast, the caller is responsible for the extra invariants
-                unsafe { intrinsics::simd_cast(self) }
+                unsafe { core::intrinsics::simd::simd_cast(self) }
             }
 
             #[inline]
@@ -289,7 +289,7 @@ macro_rules! impl_trait {
             #[inline]
             fn abs(self) -> Self {
                 // Safety: `self` is a float vector
-                unsafe { intrinsics::simd_fabs(self) }
+                unsafe { core::intrinsics::simd::simd_fabs(self) }
             }
 
             #[inline]
@@ -363,13 +363,13 @@ macro_rules! impl_trait {
             #[inline]
             fn simd_min(self, other: Self) -> Self {
                 // Safety: `self` and `other` are float vectors
-                unsafe { intrinsics::simd_fmin(self, other) }
+                unsafe { core::intrinsics::simd::simd_fmin(self, other) }
             }
 
             #[inline]
             fn simd_max(self, other: Self) -> Self {
                 // Safety: `self` and `other` are floating point vectors
-                unsafe { intrinsics::simd_fmax(self, other) }
+                unsafe { core::intrinsics::simd::simd_fmax(self, other) }
             }
 
             #[inline]
@@ -391,7 +391,7 @@ macro_rules! impl_trait {
                     self.as_array().iter().sum()
                 } else {
                     // Safety: `self` is a float vector
-                    unsafe { intrinsics::simd_reduce_add_ordered(self, 0.) }
+                    unsafe { core::intrinsics::simd::simd_reduce_add_ordered(self, 0.) }
                 }
             }
 
@@ -402,20 +402,20 @@ macro_rules! impl_trait {
                     self.as_array().iter().product()
                 } else {
                     // Safety: `self` is a float vector
-                    unsafe { intrinsics::simd_reduce_mul_ordered(self, 1.) }
+                    unsafe { core::intrinsics::simd::simd_reduce_mul_ordered(self, 1.) }
                 }
             }
 
             #[inline]
             fn reduce_max(self) -> Self::Scalar {
                 // Safety: `self` is a float vector
-                unsafe { intrinsics::simd_reduce_max(self) }
+                unsafe { core::intrinsics::simd::simd_reduce_max(self) }
             }
 
             #[inline]
             fn reduce_min(self) -> Self::Scalar {
                 // Safety: `self` is a float vector
-                unsafe { intrinsics::simd_reduce_min(self) }
+                unsafe { core::intrinsics::simd::simd_reduce_min(self) }
             }
         }
         )*
diff --git a/crates/core_simd/src/simd/num/int.rs b/crates/core_simd/src/simd/num/int.rs
index 1f1aa27278299..d7598d9ceaf92 100644
--- a/crates/core_simd/src/simd/num/int.rs
+++ b/crates/core_simd/src/simd/num/int.rs
@@ -1,6 +1,6 @@
 use super::sealed::Sealed;
 use crate::simd::{
-    cmp::SimdPartialOrd, intrinsics, num::SimdUint, LaneCount, Mask, Simd, SimdCast, SimdElement,
+    cmp::SimdPartialOrd, num::SimdUint, LaneCount, Mask, Simd, SimdCast, SimdElement,
     SupportedLaneCount,
 };
 
@@ -237,19 +237,19 @@ macro_rules! impl_trait {
             #[inline]
             fn cast<T: SimdCast>(self) -> Self::Cast<T> {
                 // Safety: supported types are guaranteed by SimdCast
-                unsafe { intrinsics::simd_as(self) }
+                unsafe { core::intrinsics::simd::simd_as(self) }
             }
 
             #[inline]
             fn saturating_add(self, second: Self) -> Self {
                 // Safety: `self` is a vector
-                unsafe { intrinsics::simd_saturating_add(self, second) }
+                unsafe { core::intrinsics::simd::simd_saturating_add(self, second) }
             }
 
             #[inline]
             fn saturating_sub(self, second: Self) -> Self {
                 // Safety: `self` is a vector
-                unsafe { intrinsics::simd_saturating_sub(self, second) }
+                unsafe { core::intrinsics::simd::simd_saturating_sub(self, second) }
             }
 
             #[inline]
@@ -293,55 +293,55 @@ macro_rules! impl_trait {
             #[inline]
             fn reduce_sum(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_reduce_add_ordered(self, 0) }
+                unsafe { core::intrinsics::simd::simd_reduce_add_ordered(self, 0) }
             }
 
             #[inline]
             fn reduce_product(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_reduce_mul_ordered(self, 1) }
+                unsafe { core::intrinsics::simd::simd_reduce_mul_ordered(self, 1) }
             }
 
             #[inline]
             fn reduce_max(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_reduce_max(self) }
+                unsafe { core::intrinsics::simd::simd_reduce_max(self) }
             }
 
             #[inline]
             fn reduce_min(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_reduce_min(self) }
+                unsafe { core::intrinsics::simd::simd_reduce_min(self) }
             }
 
             #[inline]
             fn reduce_and(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_reduce_and(self) }
+                unsafe { core::intrinsics::simd::simd_reduce_and(self) }
             }
 
             #[inline]
             fn reduce_or(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_reduce_or(self) }
+                unsafe { core::intrinsics::simd::simd_reduce_or(self) }
             }
 
             #[inline]
             fn reduce_xor(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_reduce_xor(self) }
+                unsafe { core::intrinsics::simd::simd_reduce_xor(self) }
             }
 
             #[inline]
             fn swap_bytes(self) -> Self {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_bswap(self) }
+                unsafe { core::intrinsics::simd::simd_bswap(self) }
             }
 
             #[inline]
             fn reverse_bits(self) -> Self {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_bitreverse(self) }
+                unsafe { core::intrinsics::simd::simd_bitreverse(self) }
             }
 
             #[inline]
diff --git a/crates/core_simd/src/simd/num/uint.rs b/crates/core_simd/src/simd/num/uint.rs
index c955ee8fe8bd1..53dd97f501c63 100644
--- a/crates/core_simd/src/simd/num/uint.rs
+++ b/crates/core_simd/src/simd/num/uint.rs
@@ -1,5 +1,5 @@
 use super::sealed::Sealed;
-use crate::simd::{intrinsics, LaneCount, Simd, SimdCast, SimdElement, SupportedLaneCount};
+use crate::simd::{LaneCount, Simd, SimdCast, SimdElement, SupportedLaneCount};
 
 /// Operations on SIMD vectors of unsigned integers.
 pub trait SimdUint: Copy + Sealed {
@@ -117,7 +117,7 @@ macro_rules! impl_trait {
             #[inline]
             fn cast<T: SimdCast>(self) -> Self::Cast<T> {
                 // Safety: supported types are guaranteed by SimdCast
-                unsafe { intrinsics::simd_as(self) }
+                unsafe { core::intrinsics::simd::simd_as(self) }
             }
 
             #[inline]
@@ -129,79 +129,79 @@ macro_rules! impl_trait {
             #[inline]
             fn saturating_add(self, second: Self) -> Self {
                 // Safety: `self` is a vector
-                unsafe { intrinsics::simd_saturating_add(self, second) }
+                unsafe { core::intrinsics::simd::simd_saturating_add(self, second) }
             }
 
             #[inline]
             fn saturating_sub(self, second: Self) -> Self {
                 // Safety: `self` is a vector
-                unsafe { intrinsics::simd_saturating_sub(self, second) }
+                unsafe { core::intrinsics::simd::simd_saturating_sub(self, second) }
             }
 
             #[inline]
             fn reduce_sum(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_reduce_add_ordered(self, 0) }
+                unsafe { core::intrinsics::simd::simd_reduce_add_ordered(self, 0) }
             }
 
             #[inline]
             fn reduce_product(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_reduce_mul_ordered(self, 1) }
+                unsafe { core::intrinsics::simd::simd_reduce_mul_ordered(self, 1) }
             }
 
             #[inline]
             fn reduce_max(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_reduce_max(self) }
+                unsafe { core::intrinsics::simd::simd_reduce_max(self) }
             }
 
             #[inline]
             fn reduce_min(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_reduce_min(self) }
+                unsafe { core::intrinsics::simd::simd_reduce_min(self) }
             }
 
             #[inline]
             fn reduce_and(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_reduce_and(self) }
+                unsafe { core::intrinsics::simd::simd_reduce_and(self) }
             }
 
             #[inline]
             fn reduce_or(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_reduce_or(self) }
+                unsafe { core::intrinsics::simd::simd_reduce_or(self) }
             }
 
             #[inline]
             fn reduce_xor(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_reduce_xor(self) }
+                unsafe { core::intrinsics::simd::simd_reduce_xor(self) }
             }
 
             #[inline]
             fn swap_bytes(self) -> Self {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_bswap(self) }
+                unsafe { core::intrinsics::simd::simd_bswap(self) }
             }
 
             #[inline]
             fn reverse_bits(self) -> Self {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_bitreverse(self) }
+                unsafe { core::intrinsics::simd::simd_bitreverse(self) }
             }
 
             #[inline]
             fn leading_zeros(self) -> Self {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_ctlz(self) }
+                unsafe { core::intrinsics::simd::simd_ctlz(self) }
             }
 
             #[inline]
             fn trailing_zeros(self) -> Self {
                 // Safety: `self` is an integer vector
-                unsafe { intrinsics::simd_cttz(self) }
+                unsafe { core::intrinsics::simd::simd_cttz(self) }
             }
 
             #[inline]
diff --git a/crates/core_simd/src/simd/ptr/const_ptr.rs b/crates/core_simd/src/simd/ptr/const_ptr.rs
index 97fe3fb600dfa..e217d1c8c87ca 100644
--- a/crates/core_simd/src/simd/ptr/const_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/const_ptr.rs
@@ -1,7 +1,5 @@
 use super::sealed::Sealed;
-use crate::simd::{
-    cmp::SimdPartialEq, intrinsics, num::SimdUint, LaneCount, Mask, Simd, SupportedLaneCount,
-};
+use crate::simd::{cmp::SimdPartialEq, num::SimdUint, LaneCount, Mask, Simd, SupportedLaneCount};
 
 /// Operations on SIMD vectors of constant pointers.
 pub trait SimdConstPtr: Copy + Sealed {
@@ -103,13 +101,13 @@ where
         assert_eq!(size_of::<<U as Pointee>::Metadata>(), 0);
 
         // Safety: pointers can be cast
-        unsafe { intrinsics::simd_cast_ptr(self) }
+        unsafe { core::intrinsics::simd::simd_cast_ptr(self) }
     }
 
     #[inline]
     fn cast_mut(self) -> Self::MutPtr {
         // Safety: pointers can be cast
-        unsafe { intrinsics::simd_cast_ptr(self) }
+        unsafe { core::intrinsics::simd::simd_cast_ptr(self) }
     }
 
     #[inline]
@@ -135,19 +133,19 @@ where
     #[inline]
     fn expose_addr(self) -> Self::Usize {
         // Safety: `self` is a pointer vector
-        unsafe { intrinsics::simd_expose_addr(self) }
+        unsafe { core::intrinsics::simd::simd_expose_addr(self) }
     }
 
     #[inline]
     fn from_exposed_addr(addr: Self::Usize) -> Self {
         // Safety: `self` is a pointer vector
-        unsafe { intrinsics::simd_from_exposed_addr(addr) }
+        unsafe { core::intrinsics::simd::simd_from_exposed_addr(addr) }
     }
 
     #[inline]
     fn wrapping_offset(self, count: Self::Isize) -> Self {
         // Safety: simd_arith_offset takes a vector of pointers and a vector of offsets
-        unsafe { intrinsics::simd_arith_offset(self, count) }
+        unsafe { core::intrinsics::simd::simd_arith_offset(self, count) }
     }
 
     #[inline]
diff --git a/crates/core_simd/src/simd/ptr/mut_ptr.rs b/crates/core_simd/src/simd/ptr/mut_ptr.rs
index e35633d0433dd..5cb27af4fdeba 100644
--- a/crates/core_simd/src/simd/ptr/mut_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/mut_ptr.rs
@@ -1,7 +1,5 @@
 use super::sealed::Sealed;
-use crate::simd::{
-    cmp::SimdPartialEq, intrinsics, num::SimdUint, LaneCount, Mask, Simd, SupportedLaneCount,
-};
+use crate::simd::{cmp::SimdPartialEq, num::SimdUint, LaneCount, Mask, Simd, SupportedLaneCount};
 
 /// Operations on SIMD vectors of mutable pointers.
 pub trait SimdMutPtr: Copy + Sealed {
@@ -100,13 +98,13 @@ where
         assert_eq!(size_of::<<U as Pointee>::Metadata>(), 0);
 
         // Safety: pointers can be cast
-        unsafe { intrinsics::simd_cast_ptr(self) }
+        unsafe { core::intrinsics::simd::simd_cast_ptr(self) }
     }
 
     #[inline]
     fn cast_const(self) -> Self::ConstPtr {
         // Safety: pointers can be cast
-        unsafe { intrinsics::simd_cast_ptr(self) }
+        unsafe { core::intrinsics::simd::simd_cast_ptr(self) }
     }
 
     #[inline]
@@ -132,19 +130,19 @@ where
     #[inline]
     fn expose_addr(self) -> Self::Usize {
         // Safety: `self` is a pointer vector
-        unsafe { intrinsics::simd_expose_addr(self) }
+        unsafe { core::intrinsics::simd::simd_expose_addr(self) }
     }
 
     #[inline]
     fn from_exposed_addr(addr: Self::Usize) -> Self {
         // Safety: `self` is a pointer vector
-        unsafe { intrinsics::simd_from_exposed_addr(addr) }
+        unsafe { core::intrinsics::simd::simd_from_exposed_addr(addr) }
     }
 
     #[inline]
     fn wrapping_offset(self, count: Self::Isize) -> Self {
         // Safety: simd_arith_offset takes a vector of pointers and a vector of offsets
-        unsafe { intrinsics::simd_arith_offset(self, count) }
+        unsafe { core::intrinsics::simd::simd_arith_offset(self, count) }
     }
 
     #[inline]
diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index ec8548d557458..71110bb282018 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -1,4 +1,3 @@
-use crate::simd::intrinsics;
 use crate::simd::{LaneCount, Mask, MaskElement, Simd, SimdElement, SupportedLaneCount};
 
 /// Constructs a new SIMD vector by copying elements from selected elements in other vectors.
@@ -88,7 +87,7 @@ pub trait Swizzle<const N: usize> {
     {
         // Safety: `vector` is a vector, and the index is a const array of u32.
         unsafe {
-            intrinsics::simd_shuffle(
+            core::intrinsics::simd::simd_shuffle(
                 vector,
                 vector,
                 const {
@@ -124,7 +123,7 @@ pub trait Swizzle<const N: usize> {
     {
         // Safety: `first` and `second` are vectors, and the index is a const array of u32.
         unsafe {
-            intrinsics::simd_shuffle(
+            core::intrinsics::simd::simd_shuffle(
                 first,
                 second,
                 const {
diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs
index dac013cc98dc7..ae9ff6894b0a7 100644
--- a/crates/core_simd/src/swizzle_dyn.rs
+++ b/crates/core_simd/src/swizzle_dyn.rs
@@ -44,7 +44,7 @@ where
                 ))]
                 8 => transize(vtbl1_u8, self, idxs),
                 #[cfg(target_feature = "ssse3")]
-                16 => transize(x86::_mm_shuffle_epi8, self, idxs),
+                16 => transize(x86::_mm_shuffle_epi8, self, zeroing_idxs(idxs)),
                 #[cfg(target_feature = "simd128")]
                 16 => transize(wasm::i8x16_swizzle, self, idxs),
                 #[cfg(all(
@@ -54,9 +54,9 @@ where
                 ))]
                 16 => transize(vqtbl1q_u8, self, idxs),
                 #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
-                32 => transize_raw(avx2_pshufb, self, idxs),
+                32 => transize(avx2_pshufb, self, idxs),
                 #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
-                32 => transize(x86::_mm256_permutexvar_epi8, self, idxs),
+                32 => transize(x86::_mm256_permutexvar_epi8, zeroing_idxs(idxs), self),
                 // Notable absence: avx512bw shuffle
                 // If avx512bw is available, odds of avx512vbmi are good
                 // FIXME: initial AVX512VBMI variant didn't actually pass muster
@@ -129,45 +129,25 @@ unsafe fn avx2_pshufb(bytes: Simd<u8, 32>, idxs: Simd<u8, 32>) -> Simd<u8, 32> {
 #[inline(always)]
 unsafe fn transize<T, const N: usize>(
     f: unsafe fn(T, T) -> T,
-    bytes: Simd<u8, N>,
-    idxs: Simd<u8, N>,
+    a: Simd<u8, N>,
+    b: Simd<u8, N>,
 ) -> Simd<u8, N>
 where
     LaneCount<N>: SupportedLaneCount,
 {
-    let idxs = zeroing_idxs(idxs);
     // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
-    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) }
+    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&a), mem::transmute_copy(&b))) }
 }
 
-/// Make indices that yield 0 for this architecture
+/// Make indices that yield 0 for x86
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[allow(unused)]
 #[inline(always)]
 fn zeroing_idxs<const N: usize>(idxs: Simd<u8, N>) -> Simd<u8, N>
 where
     LaneCount<N>: SupportedLaneCount,
 {
-    // On x86, make sure the top bit is set.
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-    let idxs = {
-        use crate::simd::cmp::SimdPartialOrd;
-        idxs.simd_lt(Simd::splat(N as u8))
-            .select(idxs, Simd::splat(u8::MAX))
-    };
-    // Simply do nothing on most architectures.
-    idxs
-}
-
-/// As transize but no implicit call to `zeroing_idxs`.
-#[allow(dead_code)]
-#[inline(always)]
-unsafe fn transize_raw<T, const N: usize>(
-    f: unsafe fn(T, T) -> T,
-    bytes: Simd<u8, N>,
-    idxs: Simd<u8, N>,
-) -> Simd<u8, N>
-where
-    LaneCount<N>: SupportedLaneCount,
-{
-    // SAFETY: Same obligation to use this function as to use mem::transmute_copy.
-    unsafe { mem::transmute_copy(&f(mem::transmute_copy(&bytes), mem::transmute_copy(&idxs))) }
+    use crate::simd::cmp::SimdPartialOrd;
+    idxs.simd_lt(Simd::splat(N as u8))
+        .select(idxs, Simd::splat(u8::MAX))
 }
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 105c06741c58b..9e97a3161bb2b 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -1,6 +1,5 @@
 use crate::simd::{
     cmp::SimdPartialOrd,
-    intrinsics,
     ptr::{SimdConstPtr, SimdMutPtr},
     LaneCount, Mask, MaskElement, SupportedLaneCount, Swizzle,
 };
@@ -194,7 +193,7 @@ where
     /// With padding, `read_unaligned` will read past the end of an array of N elements.
     ///
     /// # Safety
-    /// Reading `ptr` must be safe, as if by `<*const [T; N]>::read_unaligned`.
+    /// Reading `ptr` must be safe, as if by `<*const [T; N]>::read`.
     #[inline]
     const unsafe fn load(ptr: *const [T; N]) -> Self {
         // There are potentially simpler ways to write this function, but this should result in
@@ -215,7 +214,7 @@ where
     /// See `load` as to why this function is necessary.
     ///
     /// # Safety
-    /// Writing to `ptr` must be safe, as if by `<*mut [T; N]>::write_unaligned`.
+    /// Writing to `ptr` must be safe, as if by `<*mut [T; N]>::write`.
     #[inline]
     const unsafe fn store(self, ptr: *mut [T; N]) {
         // There are potentially simpler ways to write this function, but this should result in
@@ -491,7 +490,7 @@ where
         or: Self,
     ) -> Self {
         // Safety: The caller is responsible for upholding all invariants
-        unsafe { intrinsics::simd_gather(or, source, enable.to_int()) }
+        unsafe { core::intrinsics::simd::simd_gather(or, source, enable.to_int()) }
     }
 
     /// Writes the values in a SIMD vector to potentially discontiguous indices in `slice`.
@@ -650,7 +649,7 @@ where
     #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
     pub unsafe fn scatter_select_ptr(self, dest: Simd<*mut T, N>, enable: Mask<isize, N>) {
         // Safety: The caller is responsible for upholding all invariants
-        unsafe { intrinsics::simd_scatter(self, dest, enable.to_int()) }
+        unsafe { core::intrinsics::simd::simd_scatter(self, dest, enable.to_int()) }
     }
 }
 
@@ -692,7 +691,8 @@ where
     fn eq(&self, other: &Self) -> bool {
         // Safety: All SIMD vectors are SimdPartialEq, and the comparison produces a valid mask.
         let mask = unsafe {
-            let tfvec: Simd<<T as SimdElement>::Mask, N> = intrinsics::simd_eq(*self, *other);
+            let tfvec: Simd<<T as SimdElement>::Mask, N> =
+                core::intrinsics::simd::simd_eq(*self, *other);
             Mask::from_int_unchecked(tfvec)
         };
 
@@ -705,7 +705,8 @@ where
     fn ne(&self, other: &Self) -> bool {
         // Safety: All SIMD vectors are SimdPartialEq, and the comparison produces a valid mask.
         let mask = unsafe {
-            let tfvec: Simd<<T as SimdElement>::Mask, N> = intrinsics::simd_ne(*self, *other);
+            let tfvec: Simd<<T as SimdElement>::Mask, N> =
+                core::intrinsics::simd::simd_ne(*self, *other);
             Mask::from_int_unchecked(tfvec)
         };
 
diff --git a/crates/core_simd/tests/masks.rs b/crates/core_simd/tests/masks.rs
index 00fc2a24e27a6..fc6a3476b7c60 100644
--- a/crates/core_simd/tests/masks.rs
+++ b/crates/core_simd/tests/masks.rs
@@ -99,6 +99,19 @@ macro_rules! test_mask_api {
                 assert_eq!(Mask::<$type, 2>::from_bitmask(bitmask), mask);
             }
 
+            #[cfg(feature = "all_lane_counts")]
+            #[test]
+            fn roundtrip_bitmask_conversion_odd() {
+                let values = [
+                    true, false, true, false, true, true, false, false, false, true, true,
+                ];
+                let mask = Mask::<$type, 11>::from_array(values);
+                let bitmask = mask.to_bitmask();
+                assert_eq!(bitmask, 0b11000110101);
+                assert_eq!(Mask::<$type, 11>::from_bitmask(bitmask), mask);
+            }
+
+
             #[test]
             fn cast() {
                 fn cast_impl<T: core_simd::simd::MaskElement>()
@@ -134,6 +147,35 @@ macro_rules! test_mask_api {
                 assert_eq!(bitmask.resize::<2>(0).to_ne_bytes()[..2], [0b01001001, 0b10000011]);
                 assert_eq!(Mask::<$type, 16>::from_bitmask_vector(bitmask), mask);
             }
+
+            // rust-lang/portable-simd#379
+            #[test]
+            fn roundtrip_bitmask_vector_conversion_small() {
+                use core_simd::simd::ToBytes;
+                let values = [
+                    true, false, true, true
+                ];
+                let mask = Mask::<$type, 4>::from_array(values);
+                let bitmask = mask.to_bitmask_vector();
+                assert_eq!(bitmask.resize::<1>(0).to_ne_bytes()[0], 0b00001101);
+                assert_eq!(Mask::<$type, 4>::from_bitmask_vector(bitmask), mask);
+            }
+
+            /* FIXME doesn't work with non-powers-of-two, yet
+            // rust-lang/portable-simd#379
+            #[cfg(feature = "all_lane_counts")]
+            #[test]
+            fn roundtrip_bitmask_vector_conversion_odd() {
+                use core_simd::simd::ToBytes;
+                let values = [
+                    true, false, true, false, true, true, false, false, false, true, true,
+                ];
+                let mask = Mask::<$type, 11>::from_array(values);
+                let bitmask = mask.to_bitmask_vector();
+                assert_eq!(bitmask.resize::<2>(0).to_ne_bytes()[..2], [0b00110101, 0b00000110]);
+                assert_eq!(Mask::<$type, 11>::from_bitmask_vector(bitmask), mask);
+            }
+            */
         }
     }
 }
diff --git a/crates/std_float/src/lib.rs b/crates/std_float/src/lib.rs
index 1fef17242ca85..4c547777fdeb3 100644
--- a/crates/std_float/src/lib.rs
+++ b/crates/std_float/src/lib.rs
@@ -1,7 +1,7 @@
 #![cfg_attr(feature = "as_crate", no_std)] // We are std!
 #![cfg_attr(
     feature = "as_crate",
-    feature(platform_intrinsics),
+    feature(core_intrinsics),
     feature(portable_simd),
     allow(internal_features)
 )]
@@ -10,6 +10,8 @@ use core::simd;
 #[cfg(feature = "as_crate")]
 use core_simd::simd;
 
+use core::intrinsics::simd as intrinsics;
+
 use simd::{LaneCount, Simd, SupportedLaneCount};
 
 #[cfg(feature = "as_crate")]
@@ -22,28 +24,6 @@ use experimental as sealed;
 
 use crate::sealed::Sealed;
 
-// "platform intrinsics" are essentially "codegen intrinsics"
-// each of these may be scalarized and lowered to a libm call
-extern "platform-intrinsic" {
-    // ceil
-    fn simd_ceil<T>(x: T) -> T;
-
-    // floor
-    fn simd_floor<T>(x: T) -> T;
-
-    // round
-    fn simd_round<T>(x: T) -> T;
-
-    // trunc
-    fn simd_trunc<T>(x: T) -> T;
-
-    // fsqrt
-    fn simd_fsqrt<T>(x: T) -> T;
-
-    // fma
-    fn simd_fma<T>(x: T, y: T, z: T) -> T;
-}
-
 /// This trait provides a possibly-temporary implementation of float functions
 /// that may, in the absence of hardware support, canonicalize to calling an
 /// operating system's `math.h` dynamically-loaded library (also known as a
@@ -74,7 +54,7 @@ pub trait StdFloat: Sealed + Sized {
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn mul_add(self, a: Self, b: Self) -> Self {
-        unsafe { simd_fma(self, a, b) }
+        unsafe { intrinsics::simd_fma(self, a, b) }
     }
 
     /// Produces a vector where every lane has the square root value
@@ -82,35 +62,35 @@ pub trait StdFloat: Sealed + Sized {
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn sqrt(self) -> Self {
-        unsafe { simd_fsqrt(self) }
+        unsafe { intrinsics::simd_fsqrt(self) }
     }
 
     /// Returns the smallest integer greater than or equal to each lane.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     #[inline]
     fn ceil(self) -> Self {
-        unsafe { simd_ceil(self) }
+        unsafe { intrinsics::simd_ceil(self) }
     }
 
     /// Returns the largest integer value less than or equal to each lane.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     #[inline]
     fn floor(self) -> Self {
-        unsafe { simd_floor(self) }
+        unsafe { intrinsics::simd_floor(self) }
     }
 
     /// Rounds to the nearest integer value. Ties round toward zero.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     #[inline]
     fn round(self) -> Self {
-        unsafe { simd_round(self) }
+        unsafe { intrinsics::simd_round(self) }
     }
 
     /// Returns the floating point's integer value, with its fractional part removed.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     #[inline]
     fn trunc(self) -> Self {
-        unsafe { simd_trunc(self) }
+        unsafe { intrinsics::simd_trunc(self) }
     }
 
     /// Returns the floating point's fractional value, with its integer part removed.
diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs
index b80c745aaf2fb..51b860a863560 100644
--- a/crates/test_helpers/src/lib.rs
+++ b/crates/test_helpers/src/lib.rs
@@ -1,4 +1,8 @@
-#![feature(stdsimd, powerpc_target_feature)]
+#![feature(powerpc_target_feature)]
+#![cfg_attr(
+    any(target_arch = "powerpc", target_arch = "powerpc64"),
+    feature(stdarch_powerpc)
+)]
 
 pub mod array;
 

From 8d2208ea3df77d09e4bfb8a28e131cc06bc6183a Mon Sep 17 00:00:00 2001
From: Daniel Paoliello <danpao@microsoft.com>
Date: Fri, 22 Mar 2024 16:58:39 -0700
Subject: [PATCH 10/74] Merge commit 'cff979eec1ac0473fc4960ee6cde462c6aeda824'
 into sync-portable-simd-2024-03-22

---
 .github/workflows/ci.yml                    |   5 +
 Cargo.lock                                  |   3 +
 crates/core_simd/src/lib.rs                 |  12 +-
 crates/core_simd/src/masks.rs               |   6 +
 crates/core_simd/src/swizzle_dyn.rs         |   8 +-
 crates/core_simd/src/vector.rs              | 244 ++++++++++++++++++++
 crates/core_simd/src/vendor.rs              |   2 +-
 crates/core_simd/src/vendor/arm.rs          |   8 +-
 crates/core_simd/tests/masked_load_store.rs |  35 +++
 crates/core_simd/tests/swizzle_dyn.rs       |   2 +-
 crates/std_float/Cargo.toml                 |   7 +
 crates/std_float/src/lib.rs                 | 151 ++++++++----
 crates/std_float/tests/float.rs             |  74 ++++++
 13 files changed, 505 insertions(+), 52 deletions(-)
 create mode 100644 crates/core_simd/tests/masked_load_store.rs
 create mode 100644 crates/std_float/tests/float.rs

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 90543044ea845..b292be2d6f999 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -141,6 +141,11 @@ jobs:
       - name: Test (release)
         run: cargo test --verbose --target=${{ matrix.target }} --release
 
+      - name: Generate docs
+        run: cargo doc --verbose --target=${{ matrix.target }}
+        env:
+          RUSTDOCFLAGS: -Dwarnings
+
   wasm-tests:
     name: "wasm (firefox, ${{ matrix.name }})"
     runs-on: ubuntu-latest
diff --git a/Cargo.lock b/Cargo.lock
index 46312c09657d5..1584c704fb221 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -177,6 +177,9 @@ name = "std_float"
 version = "0.1.0"
 dependencies = [
  "core_simd",
+ "test_helpers",
+ "wasm-bindgen",
+ "wasm-bindgen-test",
 ]
 
 [[package]]
diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index a25723e11cef9..7a161b7e01d25 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -13,11 +13,12 @@
     simd_ffi,
     staged_api,
     strict_provenance,
+    prelude_import,
     ptr_metadata
 )]
 #![cfg_attr(
     all(
-        any(target_arch = "aarch64", target_arch = "arm",),
+        any(target_arch = "aarch64", target_arch = "arm64ec", target_arch = "arm",),
         any(
             all(target_feature = "v6", not(target_feature = "mclass")),
             all(target_feature = "mclass", target_feature = "dsp"),
@@ -33,12 +34,21 @@
     any(target_arch = "powerpc", target_arch = "powerpc64"),
     feature(stdarch_powerpc)
 )]
+#![cfg_attr(
+    all(target_arch = "x86_64", target_feature = "avx512f"),
+    feature(stdarch_x86_avx512)
+)]
 #![warn(missing_docs, clippy::missing_inline_in_public_items)] // basically all items, really
 #![deny(unsafe_op_in_unsafe_fn, clippy::undocumented_unsafe_blocks)]
+#![doc(test(attr(deny(warnings))))]
 #![allow(internal_features)]
 #![unstable(feature = "portable_simd", issue = "86656")]
 //! Portable SIMD module.
 
+#[prelude_import]
+#[allow(unused_imports)]
+use core::prelude::v1::*;
+
 #[path = "mod.rs"]
 mod core_simd;
 pub use self::core_simd::simd;
diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index e480c25a51e6f..e6e27c76a5e99 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -34,6 +34,7 @@ mod sealed {
         fn eq(self, other: Self) -> bool;
 
         fn to_usize(self) -> usize;
+        fn max_unsigned() -> u64;
 
         type Unsigned: SimdElement;
 
@@ -78,6 +79,11 @@ macro_rules! impl_element {
                 self as usize
             }
 
+            #[inline]
+            fn max_unsigned() -> u64 {
+                <$unsigned>::MAX as u64
+            }
+
             type Unsigned = $unsigned;
 
             const TRUE: Self = -1;
diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs
index ae9ff6894b0a7..8a1079042f076 100644
--- a/crates/core_simd/src/swizzle_dyn.rs
+++ b/crates/core_simd/src/swizzle_dyn.rs
@@ -16,7 +16,10 @@ where
     #[inline]
     pub fn swizzle_dyn(self, idxs: Simd<u8, N>) -> Self {
         #![allow(unused_imports, unused_unsafe)]
-        #[cfg(all(target_arch = "aarch64", target_endian = "little"))]
+        #[cfg(all(
+            any(target_arch = "aarch64", target_arch = "arm64ec"),
+            target_endian = "little"
+        ))]
         use core::arch::aarch64::{uint8x8_t, vqtbl1q_u8, vtbl1_u8};
         #[cfg(all(
             target_arch = "arm",
@@ -37,6 +40,7 @@ where
                 #[cfg(all(
                     any(
                         target_arch = "aarch64",
+                        target_arch = "arm64ec",
                         all(target_arch = "arm", target_feature = "v7")
                     ),
                     target_feature = "neon",
@@ -48,7 +52,7 @@ where
                 #[cfg(target_feature = "simd128")]
                 16 => transize(wasm::i8x16_swizzle, self, idxs),
                 #[cfg(all(
-                    target_arch = "aarch64",
+                    any(target_arch = "aarch64", target_arch = "arm64ec"),
                     target_feature = "neon",
                     target_endian = "little"
                 ))]
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 9e97a3161bb2b..6c8205b112c31 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -1,5 +1,6 @@
 use crate::simd::{
     cmp::SimdPartialOrd,
+    num::SimdUint,
     ptr::{SimdConstPtr, SimdMutPtr},
     LaneCount, Mask, MaskElement, SupportedLaneCount, Swizzle,
 };
@@ -262,6 +263,7 @@ where
     /// # Panics
     ///
     /// Panics if the slice's length is less than the vector's `Simd::N`.
+    /// Use `load_or_default` for an alternative that does not panic.
     ///
     /// # Example
     ///
@@ -315,6 +317,143 @@ where
         unsafe { self.store(slice.as_mut_ptr().cast()) }
     }
 
+    /// Reads contiguous elements from `slice`. Elements are read so long as they're in-bounds for
+    /// the `slice`. Otherwise, the default value for the element type is returned.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::Simd;
+    /// let vec: Vec<i32> = vec![10, 11];
+    ///
+    /// let result = Simd::<i32, 4>::load_or_default(&vec);
+    /// assert_eq!(result, Simd::from_array([10, 11, 0, 0]));
+    /// ```
+    #[must_use]
+    #[inline]
+    pub fn load_or_default(slice: &[T]) -> Self
+    where
+        T: Default,
+    {
+        Self::load_or(slice, Default::default())
+    }
+
+    /// Reads contiguous elements from `slice`. Elements are read so long as they're in-bounds for
+    /// the `slice`. Otherwise, the corresponding value from `or` is passed through.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::Simd;
+    /// let vec: Vec<i32> = vec![10, 11];
+    /// let or = Simd::from_array([-5, -4, -3, -2]);
+    ///
+    /// let result = Simd::load_or(&vec, or);
+    /// assert_eq!(result, Simd::from_array([10, 11, -3, -2]));
+    /// ```
+    #[must_use]
+    #[inline]
+    pub fn load_or(slice: &[T], or: Self) -> Self {
+        Self::load_select(slice, Mask::splat(true), or)
+    }
+
+    /// Reads contiguous elements from `slice`. Each element is read from memory if its
+    /// corresponding element in `enable` is `true`.
+    ///
+    /// When the element is disabled or out of bounds for the slice, that memory location
+    /// is not accessed and the default value for the element type is returned.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::{Simd, Mask};
+    /// let vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
+    /// let enable = Mask::from_array([true, true, false, true]);
+    /// // Disabled lanes are filled with `i32::default()`, which is 0.
+    ///
+    /// let result = Simd::<i32, 4>::load_select_or_default(&vec, enable);
+    /// assert_eq!(result, Simd::from_array([10, 11, 0, 13]));
+    /// ```
+    #[must_use]
+    #[inline]
+    pub fn load_select_or_default(slice: &[T], enable: Mask<<T as SimdElement>::Mask, N>) -> Self
+    where
+        T: Default,
+    {
+        Self::load_select(slice, enable, Default::default())
+    }
+
+    /// Reads contiguous elements from `slice`. Each element is read from memory if its
+    /// corresponding element in `enable` is `true`.
+    ///
+    /// When the element is disabled or out of bounds for the slice, that memory location
+    /// is not accessed and the corresponding value from `or` is passed through.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::{Simd, Mask};
+    /// let vec: Vec<i32> = vec![10, 11, 12, 13, 14, 15, 16, 17, 18];
+    /// let enable = Mask::from_array([true, true, false, true]);
+    /// let or = Simd::from_array([-5, -4, -3, -2]);
+    ///
+    /// let result = Simd::load_select(&vec, enable, or);
+    /// assert_eq!(result, Simd::from_array([10, 11, -3, 13]));
+    /// ```
+    #[must_use]
+    #[inline]
+    pub fn load_select(
+        slice: &[T],
+        mut enable: Mask<<T as SimdElement>::Mask, N>,
+        or: Self,
+    ) -> Self {
+        enable &= mask_up_to(slice.len());
+        // SAFETY: We performed the bounds check by updating the mask. &[T] is properly aligned to
+        // the element.
+        unsafe { Self::load_select_ptr(slice.as_ptr(), enable, or) }
+    }
+
+    /// Reads contiguous elements from `slice` where `enable` is `true`; disabled elements are
+    /// not accessed and take the corresponding value from `or`.
+    ///
+    /// # Safety
+    /// Every enabled element must be in bounds for the `slice`.
+    #[must_use]
+    #[inline]
+    pub unsafe fn load_select_unchecked(
+        slice: &[T],
+        enable: Mask<<T as SimdElement>::Mask, N>,
+        or: Self,
+    ) -> Self {
+        let ptr = slice.as_ptr();
+        // SAFETY: The safety of reading elements from `slice` is ensured by the caller.
+        unsafe { Self::load_select_ptr(ptr, enable, or) }
+    }
+
+    /// Reads contiguous elements starting at `ptr` where `enable` is `true`; disabled elements
+    /// are not accessed and take the corresponding value from `or`.
+    ///
+    /// # Safety
+    /// Each enabled element must satisfy the same conditions as [`core::ptr::read`].
+    #[must_use]
+    #[inline]
+    pub unsafe fn load_select_ptr(
+        ptr: *const T,
+        enable: Mask<<T as SimdElement>::Mask, N>,
+        or: Self,
+    ) -> Self {
+        // SAFETY: The safety of reading elements through `ptr` is ensured by the caller.
+        unsafe { core::intrinsics::simd::simd_masked_load(enable.to_int(), ptr, or) }
+    }
+
     /// Reads from potentially discontiguous indices in `slice` to construct a SIMD vector.
     /// If an index is out-of-bounds, the element is instead selected from the `or` vector.
     ///
@@ -493,6 +632,77 @@ where
         unsafe { core::intrinsics::simd::simd_gather(or, source, enable.to_int()) }
     }
 
+    /// Conditionally write contiguous elements to `slice`. The `enable` mask controls
+    /// which elements are written, as long as they're in-bounds of the `slice`.
+    /// If the element is disabled or out of bounds, no memory access to that location
+    /// is made.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::{Simd, Mask};
+    /// let mut arr = [0i32; 4];
+    /// let write = Simd::from_array([-5, -4, -3, -2]);
+    /// let enable = Mask::from_array([false, true, true, true]);
+    ///
+    /// write.store_select(&mut arr[..3], enable);
+    /// assert_eq!(arr, [0, -4, -3, 0]);
+    /// ```
+    #[inline]
+    pub fn store_select(self, slice: &mut [T], mut enable: Mask<<T as SimdElement>::Mask, N>) {
+        enable &= mask_up_to(slice.len());
+        // SAFETY: We performed the bounds check by updating the mask. &[T] is properly aligned to
+        // the element.
+        unsafe { self.store_select_ptr(slice.as_mut_ptr(), enable) }
+    }
+
+    /// Conditionally write contiguous elements to `slice`. The `enable` mask controls
+    /// which elements are written.
+    ///
+    /// # Safety
+    ///
+    /// Every enabled element must be in bounds for the `slice`.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::{Simd, Mask};
+    /// let mut arr = [0i32; 4];
+    /// let write = Simd::from_array([-5, -4, -3, -2]);
+    /// let enable = Mask::from_array([false, true, true, true]);
+    ///
+    /// unsafe { write.store_select_unchecked(&mut arr, enable) };
+    /// assert_eq!(arr, [0, -4, -3, -2]);
+    /// ```
+    #[inline]
+    pub unsafe fn store_select_unchecked(
+        self,
+        slice: &mut [T],
+        enable: Mask<<T as SimdElement>::Mask, N>,
+    ) {
+        let ptr = slice.as_mut_ptr();
+        // SAFETY: The safety of writing elements in `slice` is ensured by the caller.
+        unsafe { self.store_select_ptr(ptr, enable) }
+    }
+
+    /// Conditionally write contiguous elements starting from `ptr`.
+    /// The `enable` mask controls which elements are written.
+    /// When disabled, the memory location corresponding to that element is not accessed.
+    ///
+    /// # Safety
+    ///
+    /// Memory addresses for each element are calculated with [`pointer::wrapping_offset`] and
+    /// each enabled element must satisfy the same conditions as [`core::ptr::write`].
+    #[inline]
+    pub unsafe fn store_select_ptr(self, ptr: *mut T, enable: Mask<<T as SimdElement>::Mask, N>) {
+        // SAFETY: The safety of writing elements through `ptr` is ensured by the caller.
+        unsafe { core::intrinsics::simd::simd_masked_store(enable.to_int(), ptr, self) }
+    }
+
     /// Writes the values in a SIMD vector to potentially discontiguous indices in `slice`.
     /// If an index is out-of-bounds, the write is suppressed without panicking.
     /// If two elements in the scattered vector would write to the same index
@@ -980,3 +1190,37 @@ where
 {
     type Mask = isize;
 }
+
+#[inline]
+fn lane_indices<const N: usize>() -> Simd<usize, N>
+where
+    LaneCount<N>: SupportedLaneCount,
+{
+    let mut index = [0; N];
+    for i in 0..N {
+        index[i] = i;
+    }
+    Simd::from_array(index)
+}
+
+#[inline]
+fn mask_up_to<M, const N: usize>(len: usize) -> Mask<M, N>
+where
+    LaneCount<N>: SupportedLaneCount,
+    M: MaskElement,
+{
+    let index = lane_indices::<N>();
+    let max_value: u64 = M::max_unsigned();
+    macro_rules! case {
+        ($ty:ty) => {
+            if N < <$ty>::MAX as usize && max_value as $ty as u64 == max_value {
+                return index.cast().simd_lt(Simd::splat(len.min(N) as $ty)).cast();
+            }
+        };
+    }
+    case!(u8);
+    case!(u16);
+    case!(u32);
+    case!(u64);
+    index.simd_lt(Simd::splat(len)).cast()
+}
diff --git a/crates/core_simd/src/vendor.rs b/crates/core_simd/src/vendor.rs
index 6223bedb4e13c..1a34a3a8de5c4 100644
--- a/crates/core_simd/src/vendor.rs
+++ b/crates/core_simd/src/vendor.rs
@@ -24,7 +24,7 @@ mod x86;
 #[cfg(target_arch = "wasm32")]
 mod wasm32;
 
-#[cfg(any(target_arch = "aarch64", target_arch = "arm",))]
+#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec", target_arch = "arm",))]
 mod arm;
 
 #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
diff --git a/crates/core_simd/src/vendor/arm.rs b/crates/core_simd/src/vendor/arm.rs
index ee5c642137367..f8878d11f094d 100644
--- a/crates/core_simd/src/vendor/arm.rs
+++ b/crates/core_simd/src/vendor/arm.rs
@@ -4,12 +4,13 @@ use crate::simd::*;
 #[cfg(target_arch = "arm")]
 use core::arch::arm::*;
 
-#[cfg(target_arch = "aarch64")]
+#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
 use core::arch::aarch64::*;
 
 #[cfg(all(
     any(
         target_arch = "aarch64",
+        target_arch = "arm64ec",
         all(target_arch = "arm", target_feature = "v7"),
     ),
     target_endian = "little"
@@ -69,7 +70,10 @@ mod simd32 {
     from_transmute! { unsafe Simd<i8, 4> => int8x4_t }
 }
 
-#[cfg(target_arch = "aarch64")]
+#[cfg(all(
+    any(target_arch = "aarch64", target_arch = "arm64ec"),
+    target_endian = "little"
+))]
 mod aarch64 {
     use super::neon::*;
     use super::*;
diff --git a/crates/core_simd/tests/masked_load_store.rs b/crates/core_simd/tests/masked_load_store.rs
new file mode 100644
index 0000000000000..3d38658e945f7
--- /dev/null
+++ b/crates/core_simd/tests/masked_load_store.rs
@@ -0,0 +1,35 @@
+#![feature(portable_simd)]
+use core_simd::simd::prelude::*;
+
+#[cfg(target_arch = "wasm32")]
+use wasm_bindgen_test::*;
+
+#[cfg(target_arch = "wasm32")]
+wasm_bindgen_test_configure!(run_in_browser);
+
+#[test]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn masked_load_store() {
+    let mut arr = [u8::MAX; 7];
+
+    u8x4::splat(0).store_select(&mut arr[5..], Mask::from_array([false, true, false, true]));
+    // write to index 8 is OOB and dropped
+    assert_eq!(arr, [255u8, 255, 255, 255, 255, 255, 0]);
+
+    u8x4::from_array([0, 1, 2, 3]).store_select(&mut arr[1..], Mask::splat(true));
+    assert_eq!(arr, [255u8, 0, 1, 2, 3, 255, 0]);
+
+    // read from index 7 is OOB and dropped
+    assert_eq!(
+        u8x4::load_or(&arr[4..], u8x4::splat(42)),
+        u8x4::from_array([3, 255, 0, 42])
+    );
+    assert_eq!(
+        u8x4::load_select(
+            &arr[4..],
+            Mask::from_array([true, false, true, true]),
+            u8x4::splat(42)
+        ),
+        u8x4::from_array([3, 42, 0, 42])
+    );
+}
diff --git a/crates/core_simd/tests/swizzle_dyn.rs b/crates/core_simd/tests/swizzle_dyn.rs
index f21a937f01c44..19ffe1417c8c4 100644
--- a/crates/core_simd/tests/swizzle_dyn.rs
+++ b/crates/core_simd/tests/swizzle_dyn.rs
@@ -1,6 +1,6 @@
 #![feature(portable_simd)]
 use core::{fmt, ops::RangeInclusive};
-use test_helpers::{self, biteq, make_runner, prop_assert_biteq};
+use test_helpers::{biteq, make_runner, prop_assert_biteq};
 
 fn swizzle_dyn_scalar_ver<const N: usize>(values: [u8; N], idxs: [u8; N]) -> [u8; N] {
     let mut array = [0; N];
diff --git a/crates/std_float/Cargo.toml b/crates/std_float/Cargo.toml
index 84c69774cbdfe..0896094ee63f4 100644
--- a/crates/std_float/Cargo.toml
+++ b/crates/std_float/Cargo.toml
@@ -8,6 +8,13 @@ edition = "2021"
 [dependencies]
 core_simd = { path = "../core_simd", default-features = false }
 
+[dev-dependencies.test_helpers]
+path = "../test_helpers"
+
+[target.'cfg(target_arch = "wasm32")'.dev-dependencies]
+wasm-bindgen = "0.2"
+wasm-bindgen-test = "0.3"
+
 [features]
 default = ["as_crate"]
 as_crate = []
diff --git a/crates/std_float/src/lib.rs b/crates/std_float/src/lib.rs
index 4c547777fdeb3..148aa5f9f1771 100644
--- a/crates/std_float/src/lib.rs
+++ b/crates/std_float/src/lib.rs
@@ -1,4 +1,3 @@
-#![cfg_attr(feature = "as_crate", no_std)] // We are std!
 #![cfg_attr(
     feature = "as_crate",
     feature(core_intrinsics),
@@ -44,7 +43,7 @@ use crate::sealed::Sealed;
 /// For now this trait is available to permit experimentation with SIMD float
 /// operations that may lack hardware support, such as `mul_add`.
 pub trait StdFloat: Sealed + Sized {
-    /// Fused multiply-add.  Computes `(self * a) + b` with only one rounding error,
+    /// Elementwise fused multiply-add. Computes `(self * a) + b` with only one rounding error,
     /// yielding a more accurate result than an unfused multiply-add.
     ///
     /// Using `mul_add` *may* be more performant than an unfused multiply-add if the target
@@ -57,22 +56,65 @@ pub trait StdFloat: Sealed + Sized {
         unsafe { intrinsics::simd_fma(self, a, b) }
     }
 
-    /// Produces a vector where every lane has the square root value
-    /// of the equivalently-indexed lane in `self`
+    /// Produces a vector where every element has the square root value
+    /// of the equivalently-indexed element in `self`
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original value"]
     fn sqrt(self) -> Self {
         unsafe { intrinsics::simd_fsqrt(self) }
     }
 
-    /// Returns the smallest integer greater than or equal to each lane.
+    /// Produces a vector where every element has the sine of the value
+    /// in the equivalently-indexed element in `self`.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    fn sin(self) -> Self;
+
+    /// Produces a vector where every element has the cosine of the value
+    /// in the equivalently-indexed element in `self`.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    fn cos(self) -> Self;
+
+    /// Produces a vector where every element has the exponential (base e) of the value
+    /// in the equivalently-indexed element in `self`.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    fn exp(self) -> Self;
+
+    /// Produces a vector where every element has the exponential (base 2) of the value
+    /// in the equivalently-indexed element in `self`.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    fn exp2(self) -> Self;
+
+    /// Produces a vector where every element has the natural logarithm of the value
+    /// in the equivalently-indexed element in `self`.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    fn ln(self) -> Self;
+
+    /// Produces a vector where every element has the logarithm of the equivalently-indexed
+    /// element in `self`, with respect to the equivalently-indexed element in `base`.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    fn log(self, base: Self) -> Self {
+        unsafe { intrinsics::simd_div(self.ln(), base.ln()) }
+    }
+
+    /// Produces a vector where every element has the base-2 logarithm of the value
+    /// in the equivalently-indexed element in `self`.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    fn log2(self) -> Self;
+
+    /// Produces a vector where every element has the base-10 logarithm of the value
+    /// in the equivalently-indexed element in `self`.
+    #[must_use = "method returns a new vector and does not mutate the original value"]
+    fn log10(self) -> Self;
+
+    /// Returns the smallest integer greater than or equal to each element.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     #[inline]
     fn ceil(self) -> Self {
         unsafe { intrinsics::simd_ceil(self) }
     }
 
-    /// Returns the largest integer value less than or equal to each lane.
+    /// Returns the largest integer value less than or equal to each element.
     #[must_use = "method returns a new vector and does not mutate the original value"]
     #[inline]
     fn floor(self) -> Self {
@@ -101,46 +143,65 @@ pub trait StdFloat: Sealed + Sized {
 impl<const N: usize> Sealed for Simd<f32, N> where LaneCount<N>: SupportedLaneCount {}
 impl<const N: usize> Sealed for Simd<f64, N> where LaneCount<N>: SupportedLaneCount {}
 
-// We can safely just use all the defaults.
-impl<const N: usize> StdFloat for Simd<f32, N>
-where
-    LaneCount<N>: SupportedLaneCount,
-{
-    /// Returns the floating point's fractional value, with its integer part removed.
-    #[must_use = "method returns a new vector and does not mutate the original value"]
-    #[inline]
-    fn fract(self) -> Self {
-        self - self.trunc()
-    }
-}
-
-impl<const N: usize> StdFloat for Simd<f64, N>
-where
-    LaneCount<N>: SupportedLaneCount,
-{
-    /// Returns the floating point's fractional value, with its integer part removed.
-    #[must_use = "method returns a new vector and does not mutate the original value"]
-    #[inline]
-    fn fract(self) -> Self {
-        self - self.trunc()
+macro_rules! impl_float {
+    {
+        $($fn:ident: $intrinsic:ident,)*
+    } => {
+        impl<const N: usize> StdFloat for Simd<f32, N>
+        where
+            LaneCount<N>: SupportedLaneCount,
+        {
+            #[inline]
+            fn fract(self) -> Self {
+                self - self.trunc()
+            }
+
+            $(
+            #[inline]
+            fn $fn(self) -> Self {
+                unsafe { intrinsics::$intrinsic(self) }
+            }
+            )*
+        }
+
+        impl<const N: usize> StdFloat for Simd<f64, N>
+        where
+            LaneCount<N>: SupportedLaneCount,
+        {
+            #[inline]
+            fn fract(self) -> Self {
+                self - self.trunc()
+            }
+
+            $(
+            #[inline]
+            fn $fn(self) -> Self {
+                // https://github.com/llvm/llvm-project/issues/83729
+                #[cfg(target_arch = "aarch64")]
+                {
+                    let mut ln = Self::splat(0f64);
+                    for i in 0..N {
+                        ln[i] = self[i].$fn()
+                    }
+                    ln
+                }
+
+                #[cfg(not(target_arch = "aarch64"))]
+                {
+                    unsafe { intrinsics::$intrinsic(self) }
+                }
+            }
+            )*
+        }
     }
 }
 
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use simd::prelude::*;
-
-    #[test]
-    fn everything_works() {
-        let x = f32x4::from_array([0.1, 0.5, 0.6, -1.5]);
-        let x2 = x + x;
-        let _xc = x.ceil();
-        let _xf = x.floor();
-        let _xr = x.round();
-        let _xt = x.trunc();
-        let _xfma = x.mul_add(x, x);
-        let _xsqrt = x.sqrt();
-        let _ = x2.abs() * x2;
-    }
+impl_float! {
+    sin: simd_fsin,
+    cos: simd_fcos,
+    exp: simd_fexp,
+    exp2: simd_fexp2,
+    ln: simd_flog,
+    log2: simd_flog2,
+    log10: simd_flog10,
 }
diff --git a/crates/std_float/tests/float.rs b/crates/std_float/tests/float.rs
new file mode 100644
index 0000000000000..c66c968f8c667
--- /dev/null
+++ b/crates/std_float/tests/float.rs
@@ -0,0 +1,74 @@
+#![feature(portable_simd)]
+
+macro_rules! unary_test {
+    { $scalar:tt, $($func:tt),+ } => {
+        test_helpers::test_lanes! {
+            $(
+            fn $func<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &core_simd::simd::Simd::<$scalar, LANES>::$func,
+                    &$scalar::$func,
+                    &|_| true,
+                )
+            }
+            )*
+        }
+    }
+}
+
+macro_rules! binary_test {
+    { $scalar:tt, $($func:tt),+ } => {
+        test_helpers::test_lanes! {
+            $(
+            fn $func<const LANES: usize>() {
+                test_helpers::test_binary_elementwise(
+                    &core_simd::simd::Simd::<$scalar, LANES>::$func,
+                    &$scalar::$func,
+                    &|_, _| true,
+                )
+            }
+            )*
+        }
+    }
+}
+
+macro_rules! ternary_test {
+    { $scalar:tt, $($func:tt),+ } => {
+        test_helpers::test_lanes! {
+            $(
+            fn $func<const LANES: usize>() {
+                test_helpers::test_ternary_elementwise(
+                    &core_simd::simd::Simd::<$scalar, LANES>::$func,
+                    &$scalar::$func,
+                    &|_, _, _| true,
+                )
+            }
+            )*
+        }
+    }
+}
+
+macro_rules! impl_tests {
+    { $scalar:tt } => {
+        mod $scalar {
+            use std_float::StdFloat;
+
+            unary_test! { $scalar, sqrt, sin, cos, exp, exp2, ln, log2, log10, ceil, floor, round, trunc }
+            binary_test! { $scalar, log }
+            ternary_test! { $scalar, mul_add }
+
+            test_helpers::test_lanes! {
+                fn fract<const LANES: usize>() {
+                    test_helpers::test_unary_elementwise_flush_subnormals(
+                        &core_simd::simd::Simd::<$scalar, LANES>::fract,
+                        &$scalar::fract,
+                        &|_| true,
+                    )
+                }
+            }
+        }
+    }
+}
+
+impl_tests! { f32 }
+impl_tests! { f64 }

From 6c39a26f3319c7e77f8ffcd94e6f36d0c28bf3a9 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Sat, 23 Mar 2024 11:52:33 +0100
Subject: [PATCH 11/74] add without_provenance to pointer types

---
 crates/core_simd/src/simd/ptr/const_ptr.rs | 21 +++++++++++++++++++++
 crates/core_simd/src/simd/ptr/mut_ptr.rs   | 21 +++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/crates/core_simd/src/simd/ptr/const_ptr.rs b/crates/core_simd/src/simd/ptr/const_ptr.rs
index e217d1c8c87ca..2e7654347c0ae 100644
--- a/crates/core_simd/src/simd/ptr/const_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/const_ptr.rs
@@ -42,6 +42,19 @@ pub trait SimdConstPtr: Copy + Sealed {
     /// Equivalent to calling [`pointer::addr`] on each element.
     fn addr(self) -> Self::Usize;
 
+    /// Convert an address to a pointer without giving it any provenance.
+    ///
+    /// Without provenance, this pointer is not associated with any actual allocation. Such a
+    /// no-provenance pointer may be used for zero-sized memory accesses (if suitably aligned), but
+    /// non-zero-sized memory accesses with a no-provenance pointer are UB. No-provenance pointers
+    /// are little more than a usize address in disguise.
+    ///
+    /// This is different from [`Self::from_exposed_addr`], which creates a pointer that picks up a
+    /// previously exposed provenance.
+    ///
+    /// Equivalent to calling [`core::ptr::without_provenance`] on each element.
+    fn without_provenance(addr: Self::Usize) -> Self;
+
     /// Creates a new pointer with the given address.
     ///
     /// This performs the same operation as a cast, but copies the *address-space* and
@@ -118,6 +131,14 @@ where
         unsafe { core::mem::transmute_copy(&self) }
     }
 
+    #[inline]
+    fn without_provenance(addr: Self::Usize) -> Self {
+        // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
+        // SAFETY: Integer-to-pointer transmutes are valid (if you are okay with not getting any
+        // provenance).
+        unsafe { core::mem::transmute_copy(&addr) }
+    }
+
     #[inline]
     fn with_addr(self, addr: Self::Usize) -> Self {
         // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
diff --git a/crates/core_simd/src/simd/ptr/mut_ptr.rs b/crates/core_simd/src/simd/ptr/mut_ptr.rs
index 5cb27af4fdeba..5323bb74f6fe2 100644
--- a/crates/core_simd/src/simd/ptr/mut_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/mut_ptr.rs
@@ -39,6 +39,19 @@ pub trait SimdMutPtr: Copy + Sealed {
     /// Equivalent to calling [`pointer::addr`] on each element.
     fn addr(self) -> Self::Usize;
 
+    /// Convert an address to a pointer without giving it any provenance.
+    ///
+    /// Without provenance, this pointer is not associated with any actual allocation. Such a
+    /// no-provenance pointer may be used for zero-sized memory accesses (if suitably aligned), but
+    /// non-zero-sized memory accesses with a no-provenance pointer are UB. No-provenance pointers
+    /// are little more than a usize address in disguise.
+    ///
+    /// This is different from [`Self::from_exposed_addr`], which creates a pointer that picks up a
+    /// previously exposed provenance.
+    ///
+    /// Equivalent to calling [`core::ptr::without_provenance`] on each element.
+    fn without_provenance(addr: Self::Usize) -> Self;
+
     /// Creates a new pointer with the given address.
     ///
     /// This performs the same operation as a cast, but copies the *address-space* and
@@ -115,6 +128,14 @@ where
         unsafe { core::mem::transmute_copy(&self) }
     }
 
+    #[inline]
+    fn without_provenance(addr: Self::Usize) -> Self {
+        // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.
+        // SAFETY: Integer-to-pointer transmutes are valid (if you are okay with not getting any
+        // provenance).
+        unsafe { core::mem::transmute_copy(&addr) }
+    }
+
     #[inline]
     fn with_addr(self, addr: Self::Usize) -> Self {
         // FIXME(strict_provenance_magic): I am magic and should be a compiler intrinsic.

From def711f17ba31ab85a292a3eb853e5f39c1976bc Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Sat, 23 Mar 2024 11:47:11 +0100
Subject: [PATCH 12/74] rename ptr::from_exposed_addr ->
 ptr::with_exposed_provenance

---
 crates/core_simd/src/simd/ptr/const_ptr.rs |  8 ++++----
 crates/core_simd/src/simd/ptr/mut_ptr.rs   |  8 ++++----
 crates/core_simd/tests/pointers.rs         | 12 ++++++------
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/crates/core_simd/src/simd/ptr/const_ptr.rs b/crates/core_simd/src/simd/ptr/const_ptr.rs
index e217d1c8c87ca..3ec9fccbff954 100644
--- a/crates/core_simd/src/simd/ptr/const_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/const_ptr.rs
@@ -51,13 +51,13 @@ pub trait SimdConstPtr: Copy + Sealed {
     fn with_addr(self, addr: Self::Usize) -> Self;
 
     /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use
-    /// in [`Self::from_exposed_addr`].
+    /// in [`Self::with_exposed_provenance`].
     fn expose_addr(self) -> Self::Usize;
 
     /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
     ///
-    /// Equivalent to calling [`core::ptr::from_exposed_addr`] on each element.
-    fn from_exposed_addr(addr: Self::Usize) -> Self;
+    /// Equivalent to calling [`core::ptr::with_exposed_provenance`] on each element.
+    fn with_exposed_provenance(addr: Self::Usize) -> Self;
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
@@ -137,7 +137,7 @@ where
     }
 
     #[inline]
-    fn from_exposed_addr(addr: Self::Usize) -> Self {
+    fn with_exposed_provenance(addr: Self::Usize) -> Self {
         // Safety: `self` is a pointer vector
         unsafe { core::intrinsics::simd::simd_from_exposed_addr(addr) }
     }
diff --git a/crates/core_simd/src/simd/ptr/mut_ptr.rs b/crates/core_simd/src/simd/ptr/mut_ptr.rs
index 5cb27af4fdeba..1142839e213b9 100644
--- a/crates/core_simd/src/simd/ptr/mut_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/mut_ptr.rs
@@ -48,13 +48,13 @@ pub trait SimdMutPtr: Copy + Sealed {
     fn with_addr(self, addr: Self::Usize) -> Self;
 
     /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use
-    /// in [`Self::from_exposed_addr`].
+    /// in [`Self::with_exposed_provenance`].
     fn expose_addr(self) -> Self::Usize;
 
     /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
     ///
-    /// Equivalent to calling [`core::ptr::from_exposed_addr_mut`] on each element.
-    fn from_exposed_addr(addr: Self::Usize) -> Self;
+    /// Equivalent to calling [`core::ptr::with_exposed_provenance_mut`] on each element.
+    fn with_exposed_provenance(addr: Self::Usize) -> Self;
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
@@ -134,7 +134,7 @@ where
     }
 
     #[inline]
-    fn from_exposed_addr(addr: Self::Usize) -> Self {
+    fn with_exposed_provenance(addr: Self::Usize) -> Self {
         // Safety: `self` is a pointer vector
         unsafe { core::intrinsics::simd::simd_from_exposed_addr(addr) }
     }
diff --git a/crates/core_simd/tests/pointers.rs b/crates/core_simd/tests/pointers.rs
index b9f32d16e01d1..5984fdae2f9b4 100644
--- a/crates/core_simd/tests/pointers.rs
+++ b/crates/core_simd/tests/pointers.rs
@@ -80,10 +80,10 @@ mod const_ptr {
             );
         }
 
-        fn from_exposed_addr<const LANES: usize>() {
+        fn with_exposed_provenance<const LANES: usize>() {
             test_helpers::test_unary_elementwise(
-                &Simd::<*const u32, LANES>::from_exposed_addr,
-                &core::ptr::from_exposed_addr::<u32>,
+                &Simd::<*const u32, LANES>::with_exposed_provenance,
+                &core::ptr::with_exposed_provenance::<u32>,
                 &|_| true,
             );
         }
@@ -103,10 +103,10 @@ mod mut_ptr {
             );
         }
 
-        fn from_exposed_addr<const LANES: usize>() {
+        fn with_exposed_provenance<const LANES: usize>() {
             test_helpers::test_unary_elementwise(
-                &Simd::<*mut u32, LANES>::from_exposed_addr,
-                &core::ptr::from_exposed_addr_mut::<u32>,
+                &Simd::<*mut u32, LANES>::with_exposed_provenance,
+                &core::ptr::with_exposed_provenance_mut::<u32>,
                 &|_| true,
             );
         }

From c7a9561fe2fdcda49a30965a5b0225b17711cfc6 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Sat, 23 Mar 2024 23:00:53 +0100
Subject: [PATCH 13/74] also rename the SIMD intrinsic

---
 crates/core_simd/src/simd/ptr/const_ptr.rs | 2 +-
 crates/core_simd/src/simd/ptr/mut_ptr.rs   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/core_simd/src/simd/ptr/const_ptr.rs b/crates/core_simd/src/simd/ptr/const_ptr.rs
index 3ec9fccbff954..4d2fe999ca6fe 100644
--- a/crates/core_simd/src/simd/ptr/const_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/const_ptr.rs
@@ -139,7 +139,7 @@ where
     #[inline]
     fn with_exposed_provenance(addr: Self::Usize) -> Self {
         // Safety: `self` is a pointer vector
-        unsafe { core::intrinsics::simd::simd_from_exposed_addr(addr) }
+        unsafe { core::intrinsics::simd::simd_with_exposed_provenance(addr) }
     }
 
     #[inline]
diff --git a/crates/core_simd/src/simd/ptr/mut_ptr.rs b/crates/core_simd/src/simd/ptr/mut_ptr.rs
index 1142839e213b9..b3437b9c49966 100644
--- a/crates/core_simd/src/simd/ptr/mut_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/mut_ptr.rs
@@ -136,7 +136,7 @@ where
     #[inline]
     fn with_exposed_provenance(addr: Self::Usize) -> Self {
         // Safety: `self` is a pointer vector
-        unsafe { core::intrinsics::simd::simd_from_exposed_addr(addr) }
+        unsafe { core::intrinsics::simd::simd_with_exposed_provenance(addr) }
     }
 
     #[inline]

From 61daba974050e2fb479927f9b5c4a1518e61e674 Mon Sep 17 00:00:00 2001
From: Daniel Paoliello <danpao@microsoft.com>
Date: Mon, 25 Mar 2024 11:02:02 -0700
Subject: [PATCH 14/74] Import the 2021 prelude in the core crate

---
 crates/core_simd/src/lib.rs    | 4 ----
 crates/core_simd/src/vector.rs | 1 -
 2 files changed, 5 deletions(-)

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 7a161b7e01d25..48514e52587f0 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -45,10 +45,6 @@
 #![unstable(feature = "portable_simd", issue = "86656")]
 //! Portable SIMD module.
 
-#[prelude_import]
-#[allow(unused_imports)]
-use core::prelude::v1::*;
-
 #[path = "mod.rs"]
 mod core_simd;
 pub use self::core_simd::simd;
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 6c8205b112c31..8dbdfc0e1fe03 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -4,7 +4,6 @@ use crate::simd::{
     ptr::{SimdConstPtr, SimdMutPtr},
     LaneCount, Mask, MaskElement, SupportedLaneCount, Swizzle,
 };
-use core::convert::{TryFrom, TryInto};
 
 /// A SIMD vector with the shape of `[T; N]` but the operations of `T`.
 ///

From bfacd1473f244d3a9b6ea837c14b888ee426171c Mon Sep 17 00:00:00 2001
From: WANG Rui <wangrui@loongson.cn>
Date: Wed, 20 Mar 2024 10:02:44 +0800
Subject: [PATCH 15/74] Add loongarch64 vendor conversions

---
 crates/core_simd/src/lib.rs                |  1 +
 crates/core_simd/src/vendor.rs             |  3 +++
 crates/core_simd/src/vendor/loongarch64.rs | 31 ++++++++++++++++++++++
 3 files changed, 35 insertions(+)
 create mode 100644 crates/core_simd/src/vendor/loongarch64.rs

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 7a161b7e01d25..736beb84d2988 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -30,6 +30,7 @@
     all(target_arch = "arm", target_feature = "v7"),
     feature(stdarch_arm_neon_intrinsics)
 )]
+#![cfg_attr(target_arch = "loongarch64", feature(stdarch_loongarch))]
 #![cfg_attr(
     any(target_arch = "powerpc", target_arch = "powerpc64"),
     feature(stdarch_powerpc)
diff --git a/crates/core_simd/src/vendor.rs b/crates/core_simd/src/vendor.rs
index 1a34a3a8de5c4..57536e4fc77dc 100644
--- a/crates/core_simd/src/vendor.rs
+++ b/crates/core_simd/src/vendor.rs
@@ -29,3 +29,6 @@ mod arm;
 
 #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
 mod powerpc;
+
+#[cfg(target_arch = "loongarch64")]
+mod loongarch64;
diff --git a/crates/core_simd/src/vendor/loongarch64.rs b/crates/core_simd/src/vendor/loongarch64.rs
new file mode 100644
index 0000000000000..1290bc166b2b8
--- /dev/null
+++ b/crates/core_simd/src/vendor/loongarch64.rs
@@ -0,0 +1,31 @@
+use crate::simd::*;
+use core::arch::loongarch64::*;
+
+from_transmute! { unsafe u8x16 => v16u8 }
+from_transmute! { unsafe u8x32 => v32u8 }
+from_transmute! { unsafe i8x16 => v16i8 }
+from_transmute! { unsafe i8x32 => v32i8 }
+
+from_transmute! { unsafe u16x8 => v8u16 }
+from_transmute! { unsafe u16x16 => v16u16 }
+from_transmute! { unsafe i16x8 => v8i16 }
+from_transmute! { unsafe i16x16 => v16i16 }
+
+from_transmute! { unsafe u32x4 => v4u32 }
+from_transmute! { unsafe u32x8 => v8u32 }
+from_transmute! { unsafe i32x4 => v4i32 }
+from_transmute! { unsafe i32x8 => v8i32 }
+from_transmute! { unsafe f32x4 => v4f32 }
+from_transmute! { unsafe f32x8 => v8f32 }
+
+from_transmute! { unsafe u64x2 => v2u64 }
+from_transmute! { unsafe u64x4 => v4u64 }
+from_transmute! { unsafe i64x2 => v2i64 }
+from_transmute! { unsafe i64x4 => v4i64 }
+from_transmute! { unsafe f64x2 => v2f64 }
+from_transmute! { unsafe f64x4 => v4f64 }
+
+from_transmute! { unsafe usizex2 => v2u64 }
+from_transmute! { unsafe usizex4 => v4u64 }
+from_transmute! { unsafe isizex2 => v2i64 }
+from_transmute! { unsafe isizex4 => v4i64 }

From b03986147d90adecd96e4d651d2959127a32d535 Mon Sep 17 00:00:00 2001
From: joboet <jonasboettiger@icloud.com>
Date: Wed, 3 Apr 2024 15:17:00 +0200
Subject: [PATCH 16/74] rename `expose_addr` to `expose_provenance`

---
 crates/core_simd/src/simd/ptr/const_ptr.rs | 10 +++++-----
 crates/core_simd/src/simd/ptr/mut_ptr.rs   | 10 +++++-----
 crates/core_simd/tests/pointers.rs         |  6 +++---
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/crates/core_simd/src/simd/ptr/const_ptr.rs b/crates/core_simd/src/simd/ptr/const_ptr.rs
index 4d2fe999ca6fe..0f1719206c9ce 100644
--- a/crates/core_simd/src/simd/ptr/const_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/const_ptr.rs
@@ -50,9 +50,9 @@ pub trait SimdConstPtr: Copy + Sealed {
     /// Equivalent to calling [`pointer::with_addr`] on each element.
     fn with_addr(self, addr: Self::Usize) -> Self;
 
-    /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use
-    /// in [`Self::with_exposed_provenance`].
-    fn expose_addr(self) -> Self::Usize;
+    /// Exposes the "provenance" part of the pointer for future use in
+    /// [`Self::with_exposed_provenance`] and returns the "address" portion.
+    fn expose_provenance(self) -> Self::Usize;
 
     /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
     ///
@@ -131,9 +131,9 @@ where
     }
 
     #[inline]
-    fn expose_addr(self) -> Self::Usize {
+    fn expose_provenance(self) -> Self::Usize {
         // Safety: `self` is a pointer vector
-        unsafe { core::intrinsics::simd::simd_expose_addr(self) }
+        unsafe { core::intrinsics::simd::simd_expose_provenance(self) }
     }
 
     #[inline]
diff --git a/crates/core_simd/src/simd/ptr/mut_ptr.rs b/crates/core_simd/src/simd/ptr/mut_ptr.rs
index b3437b9c49966..7ba996d149c0c 100644
--- a/crates/core_simd/src/simd/ptr/mut_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/mut_ptr.rs
@@ -47,9 +47,9 @@ pub trait SimdMutPtr: Copy + Sealed {
     /// Equivalent to calling [`pointer::with_addr`] on each element.
     fn with_addr(self, addr: Self::Usize) -> Self;
 
-    /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use
-    /// in [`Self::with_exposed_provenance`].
-    fn expose_addr(self) -> Self::Usize;
+    /// Exposes the "provenance" part of the pointer for future use in
+    /// [`Self::with_exposed_provenance`] and returns the "address" portion.
+    fn expose_provenance(self) -> Self::Usize;
 
     /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
     ///
@@ -128,9 +128,9 @@ where
     }
 
     #[inline]
-    fn expose_addr(self) -> Self::Usize {
+    fn expose_provenance(self) -> Self::Usize {
         // Safety: `self` is a pointer vector
-        unsafe { core::intrinsics::simd::simd_expose_addr(self) }
+        unsafe { core::intrinsics::simd::simd_expose_provenance(self) }
     }
 
     #[inline]
diff --git a/crates/core_simd/tests/pointers.rs b/crates/core_simd/tests/pointers.rs
index 5984fdae2f9b4..90bfc5d5fd6a5 100644
--- a/crates/core_simd/tests/pointers.rs
+++ b/crates/core_simd/tests/pointers.rs
@@ -32,10 +32,10 @@ macro_rules! common_tests {
                 );
             }
 
-            fn expose_addr<const LANES: usize>() {
+            fn expose_provenance<const LANES: usize>() {
                 test_helpers::test_unary_elementwise(
-                    &Simd::<*$constness u32, LANES>::expose_addr,
-                    &<*$constness u32>::expose_addr,
+                    &Simd::<*$constness u32, LANES>::expose_provenance,
+                    &<*$constness u32>::expose_provenance,
                     &|_| true,
                 );
             }

From 0d6e714830574ed8a8afba6719db4837ce3b6b96 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Sat, 6 Apr 2024 22:13:18 +0200
Subject: [PATCH 17/74] run Miri tests on CI

---
 .github/workflows/ci.yml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b292be2d6f999..a49cfa44a74d9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -273,3 +273,15 @@ jobs:
         run: |
           echo "Found AVX features: $CPU_FEATURE"
           RUSTFLAGS="-Dwarnings -Ctarget-feature=$CPU_FEATURE" cargo test --all-targets --no-default-features ${{ matrix.features }}
+
+  miri:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Setup Rust
+        run: |
+          rustup update nightly --no-self-update
+          rustup default nightly
+          rustup component add miri rust-src
+      - name: Test (Miri)
+        run: cargo miri test

From 55339322beaef8a56ede2b9130e5479cf3e526f3 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Tue, 9 Apr 2024 07:23:14 +0200
Subject: [PATCH 18/74] do not run the 4-lane tests in Miri

---
 crates/test_helpers/src/lib.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs
index 51b860a863560..c55099434c8c1 100644
--- a/crates/test_helpers/src/lib.rs
+++ b/crates/test_helpers/src/lib.rs
@@ -539,12 +539,12 @@ macro_rules! test_lanes {
                     #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
                     lanes_1 1;
                     lanes_2 2;
-                    lanes_4 4;
                 );
 
                 #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
                 $crate::test_lanes_helper!(
                     #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
+                    lanes_4 4;
                     lanes_8 8;
                     lanes_16 16;
                     lanes_32 32;
@@ -553,17 +553,17 @@ macro_rules! test_lanes {
 
                 #[cfg(feature = "all_lane_counts")]
                 $crate::test_lanes_helper!(
-                    // test some odd and even non-power-of-2 lengths on miri
+                    // test one non-power-of-2 length on miri
                     #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
                     lanes_3 3;
-                    lanes_5 5;
-                    lanes_6 6;
                 );
 
                 #[cfg(feature = "all_lane_counts")]
                 #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
                 $crate::test_lanes_helper!(
                     #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
+                    lanes_5 5;
+                    lanes_6 6;
                     lanes_7 7;
                     lanes_9 9;
                     lanes_10 10;

From f79ce11d54f2f2e849c34539d0fdcdb4b4d71b28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Konrad=20H=C3=B6ffner?= <konrad.hoeffner@uni-leipzig.de>
Date: Wed, 10 Apr 2024 10:56:51 +0200
Subject: [PATCH 19/74] document PartialOrd difference to simd_min and simd_max

Avoid migration pitfall from packed_simd as described in #402.
---
 crates/core_simd/src/vector.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 6c8205b112c31..4bcbba727a2f7 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -925,6 +925,7 @@ where
     }
 }
 
+/// Lexicographic order. For the SIMD elementwise minimum and maximum, use `simd_min` and `simd_max` instead.
 impl<T, const N: usize> PartialOrd for Simd<T, N>
 where
     LaneCount<N>: SupportedLaneCount,

From ef5f073171234fa4816160d83be494305b4e078c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Konrad=20H=C3=B6ffner?= <konrad.hoeffner@uni-leipzig.de>
Date: Wed, 10 Apr 2024 11:32:51 +0200
Subject: [PATCH 20/74] document Ord difference to simd_min and simd_max

---
 crates/core_simd/src/vector.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 4bcbba727a2f7..c4e9ee0684cf5 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -945,6 +945,7 @@ where
 {
 }
 
+/// Lexicographic order. For the SIMD elementwise minimum and maximum, use `simd_min` and `simd_max` instead.
 impl<T, const N: usize> Ord for Simd<T, N>
 where
     LaneCount<N>: SupportedLaneCount,

From 7ba49ef81ca1cea7f5eb82440b85ba2ab859602d Mon Sep 17 00:00:00 2001
From: Daniel Paoliello <danpao@microsoft.com>
Date: Mon, 25 Mar 2024 11:02:02 -0700
Subject: [PATCH 21/74] Import the 2021 prelude in the core crate

---
 crates/core_simd/src/lib.rs    | 4 ----
 crates/core_simd/src/vector.rs | 1 -
 2 files changed, 5 deletions(-)

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 736beb84d2988..ecb7c78b4009e 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -46,10 +46,6 @@
 #![unstable(feature = "portable_simd", issue = "86656")]
 //! Portable SIMD module.
 
-#[prelude_import]
-#[allow(unused_imports)]
-use core::prelude::v1::*;
-
 #[path = "mod.rs"]
 mod core_simd;
 pub use self::core_simd::simd;
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 6c8205b112c31..8dbdfc0e1fe03 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -4,7 +4,6 @@ use crate::simd::{
     ptr::{SimdConstPtr, SimdMutPtr},
     LaneCount, Mask, MaskElement, SupportedLaneCount, Swizzle,
 };
-use core::convert::{TryFrom, TryInto};
 
 /// A SIMD vector with the shape of `[T; N]` but the operations of `T`.
 ///

From 2b03143cfeb00df820ae38c5ccf8f59b4496d40d Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Sat, 23 Mar 2024 11:47:11 +0100
Subject: [PATCH 22/74] rename ptr::from_exposed_addr ->
 ptr::with_exposed_provenance

---
 crates/core_simd/src/simd/ptr/const_ptr.rs |  8 ++++----
 crates/core_simd/src/simd/ptr/mut_ptr.rs   |  8 ++++----
 crates/core_simd/tests/pointers.rs         | 12 ++++++------
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/crates/core_simd/src/simd/ptr/const_ptr.rs b/crates/core_simd/src/simd/ptr/const_ptr.rs
index 2e7654347c0ae..cab342d61ffc3 100644
--- a/crates/core_simd/src/simd/ptr/const_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/const_ptr.rs
@@ -64,13 +64,13 @@ pub trait SimdConstPtr: Copy + Sealed {
     fn with_addr(self, addr: Self::Usize) -> Self;
 
     /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use
-    /// in [`Self::from_exposed_addr`].
+    /// in [`Self::with_exposed_provenance`].
     fn expose_addr(self) -> Self::Usize;
 
     /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
     ///
-    /// Equivalent to calling [`core::ptr::from_exposed_addr`] on each element.
-    fn from_exposed_addr(addr: Self::Usize) -> Self;
+    /// Equivalent to calling [`core::ptr::with_exposed_provenance`] on each element.
+    fn with_exposed_provenance(addr: Self::Usize) -> Self;
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
@@ -158,7 +158,7 @@ where
     }
 
     #[inline]
-    fn from_exposed_addr(addr: Self::Usize) -> Self {
+    fn with_exposed_provenance(addr: Self::Usize) -> Self {
         // Safety: `self` is a pointer vector
         unsafe { core::intrinsics::simd::simd_from_exposed_addr(addr) }
     }
diff --git a/crates/core_simd/src/simd/ptr/mut_ptr.rs b/crates/core_simd/src/simd/ptr/mut_ptr.rs
index 5323bb74f6fe2..25fbc37f8652e 100644
--- a/crates/core_simd/src/simd/ptr/mut_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/mut_ptr.rs
@@ -61,13 +61,13 @@ pub trait SimdMutPtr: Copy + Sealed {
     fn with_addr(self, addr: Self::Usize) -> Self;
 
     /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use
-    /// in [`Self::from_exposed_addr`].
+    /// in [`Self::with_exposed_provenance`].
     fn expose_addr(self) -> Self::Usize;
 
     /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
     ///
-    /// Equivalent to calling [`core::ptr::from_exposed_addr_mut`] on each element.
-    fn from_exposed_addr(addr: Self::Usize) -> Self;
+    /// Equivalent to calling [`core::ptr::with_exposed_provenance_mut`] on each element.
+    fn with_exposed_provenance(addr: Self::Usize) -> Self;
 
     /// Calculates the offset from a pointer using wrapping arithmetic.
     ///
@@ -155,7 +155,7 @@ where
     }
 
     #[inline]
-    fn from_exposed_addr(addr: Self::Usize) -> Self {
+    fn with_exposed_provenance(addr: Self::Usize) -> Self {
         // Safety: `self` is a pointer vector
         unsafe { core::intrinsics::simd::simd_from_exposed_addr(addr) }
     }
diff --git a/crates/core_simd/tests/pointers.rs b/crates/core_simd/tests/pointers.rs
index b9f32d16e01d1..5984fdae2f9b4 100644
--- a/crates/core_simd/tests/pointers.rs
+++ b/crates/core_simd/tests/pointers.rs
@@ -80,10 +80,10 @@ mod const_ptr {
             );
         }
 
-        fn from_exposed_addr<const LANES: usize>() {
+        fn with_exposed_provenance<const LANES: usize>() {
             test_helpers::test_unary_elementwise(
-                &Simd::<*const u32, LANES>::from_exposed_addr,
-                &core::ptr::from_exposed_addr::<u32>,
+                &Simd::<*const u32, LANES>::with_exposed_provenance,
+                &core::ptr::with_exposed_provenance::<u32>,
                 &|_| true,
             );
         }
@@ -103,10 +103,10 @@ mod mut_ptr {
             );
         }
 
-        fn from_exposed_addr<const LANES: usize>() {
+        fn with_exposed_provenance<const LANES: usize>() {
             test_helpers::test_unary_elementwise(
-                &Simd::<*mut u32, LANES>::from_exposed_addr,
-                &core::ptr::from_exposed_addr_mut::<u32>,
+                &Simd::<*mut u32, LANES>::with_exposed_provenance,
+                &core::ptr::with_exposed_provenance_mut::<u32>,
                 &|_| true,
             );
         }

From 6da23374ef93acfea6f2c75d2cf73b7231586156 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Sat, 23 Mar 2024 23:00:53 +0100
Subject: [PATCH 23/74] also rename the SIMD intrinsic

---
 crates/core_simd/src/simd/ptr/const_ptr.rs | 2 +-
 crates/core_simd/src/simd/ptr/mut_ptr.rs   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/core_simd/src/simd/ptr/const_ptr.rs b/crates/core_simd/src/simd/ptr/const_ptr.rs
index cab342d61ffc3..fd6e091aed5dc 100644
--- a/crates/core_simd/src/simd/ptr/const_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/const_ptr.rs
@@ -160,7 +160,7 @@ where
     #[inline]
     fn with_exposed_provenance(addr: Self::Usize) -> Self {
         // Safety: `self` is a pointer vector
-        unsafe { core::intrinsics::simd::simd_from_exposed_addr(addr) }
+        unsafe { core::intrinsics::simd::simd_with_exposed_provenance(addr) }
     }
 
     #[inline]
diff --git a/crates/core_simd/src/simd/ptr/mut_ptr.rs b/crates/core_simd/src/simd/ptr/mut_ptr.rs
index 25fbc37f8652e..2f6684b3f5700 100644
--- a/crates/core_simd/src/simd/ptr/mut_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/mut_ptr.rs
@@ -157,7 +157,7 @@ where
     #[inline]
     fn with_exposed_provenance(addr: Self::Usize) -> Self {
         // Safety: `self` is a pointer vector
-        unsafe { core::intrinsics::simd::simd_from_exposed_addr(addr) }
+        unsafe { core::intrinsics::simd::simd_with_exposed_provenance(addr) }
     }
 
     #[inline]

From b8a18fa96571ce29814d9cf32f3f16eba918a2fa Mon Sep 17 00:00:00 2001
From: joboet <jonasboettiger@icloud.com>
Date: Wed, 3 Apr 2024 15:17:00 +0200
Subject: [PATCH 24/74] rename `expose_addr` to `expose_provenance`

---
 crates/core_simd/src/simd/ptr/const_ptr.rs | 10 +++++-----
 crates/core_simd/src/simd/ptr/mut_ptr.rs   | 10 +++++-----
 crates/core_simd/tests/pointers.rs         |  6 +++---
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/crates/core_simd/src/simd/ptr/const_ptr.rs b/crates/core_simd/src/simd/ptr/const_ptr.rs
index fd6e091aed5dc..809ea7cf43b77 100644
--- a/crates/core_simd/src/simd/ptr/const_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/const_ptr.rs
@@ -63,9 +63,9 @@ pub trait SimdConstPtr: Copy + Sealed {
     /// Equivalent to calling [`pointer::with_addr`] on each element.
     fn with_addr(self, addr: Self::Usize) -> Self;
 
-    /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use
-    /// in [`Self::with_exposed_provenance`].
-    fn expose_addr(self) -> Self::Usize;
+    /// Exposes the "provenance" part of the pointer for future use in
+    /// [`Self::with_exposed_provenance`] and returns the "address" portion.
+    fn expose_provenance(self) -> Self::Usize;
 
     /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
     ///
@@ -152,9 +152,9 @@ where
     }
 
     #[inline]
-    fn expose_addr(self) -> Self::Usize {
+    fn expose_provenance(self) -> Self::Usize {
         // Safety: `self` is a pointer vector
-        unsafe { core::intrinsics::simd::simd_expose_addr(self) }
+        unsafe { core::intrinsics::simd::simd_expose_provenance(self) }
     }
 
     #[inline]
diff --git a/crates/core_simd/src/simd/ptr/mut_ptr.rs b/crates/core_simd/src/simd/ptr/mut_ptr.rs
index 2f6684b3f5700..f418f90154e06 100644
--- a/crates/core_simd/src/simd/ptr/mut_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/mut_ptr.rs
@@ -60,9 +60,9 @@ pub trait SimdMutPtr: Copy + Sealed {
     /// Equivalent to calling [`pointer::with_addr`] on each element.
     fn with_addr(self, addr: Self::Usize) -> Self;
 
-    /// Gets the "address" portion of the pointer, and "exposes" the provenance part for future use
-    /// in [`Self::with_exposed_provenance`].
-    fn expose_addr(self) -> Self::Usize;
+    /// Exposes the "provenance" part of the pointer for future use in
+    /// [`Self::with_exposed_provenance`] and returns the "address" portion.
+    fn expose_provenance(self) -> Self::Usize;
 
     /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
     ///
@@ -149,9 +149,9 @@ where
     }
 
     #[inline]
-    fn expose_addr(self) -> Self::Usize {
+    fn expose_provenance(self) -> Self::Usize {
         // Safety: `self` is a pointer vector
-        unsafe { core::intrinsics::simd::simd_expose_addr(self) }
+        unsafe { core::intrinsics::simd::simd_expose_provenance(self) }
     }
 
     #[inline]
diff --git a/crates/core_simd/tests/pointers.rs b/crates/core_simd/tests/pointers.rs
index 5984fdae2f9b4..90bfc5d5fd6a5 100644
--- a/crates/core_simd/tests/pointers.rs
+++ b/crates/core_simd/tests/pointers.rs
@@ -32,10 +32,10 @@ macro_rules! common_tests {
                 );
             }
 
-            fn expose_addr<const LANES: usize>() {
+            fn expose_provenance<const LANES: usize>() {
                 test_helpers::test_unary_elementwise(
-                    &Simd::<*$constness u32, LANES>::expose_addr,
-                    &<*$constness u32>::expose_addr,
+                    &Simd::<*$constness u32, LANES>::expose_provenance,
+                    &<*$constness u32>::expose_provenance,
                     &|_| true,
                 );
             }

From 6c6815251a0fc17036998e7ef6a4bfc7b02331e6 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Thu, 11 Apr 2024 00:15:29 -0400
Subject: [PATCH 25/74] Pin toolchain

---
 rust-toolchain.toml | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 rust-toolchain.toml

diff --git a/rust-toolchain.toml b/rust-toolchain.toml
new file mode 100644
index 0000000000000..5e885801cda21
--- /dev/null
+++ b/rust-toolchain.toml
@@ -0,0 +1,3 @@
+[toolchain]
+channel = "nightly-2024-04-10"
+components = ["rustfmt", "clippy", "miri", "rust-src"]

From 14ae03d7737f9ec5b33256524605ffec96395104 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Thu, 11 Apr 2024 00:19:20 -0400
Subject: [PATCH 26/74] Fix docs

---
 crates/core_simd/src/simd/ptr/const_ptr.rs | 2 +-
 crates/core_simd/src/simd/ptr/mut_ptr.rs   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/core_simd/src/simd/ptr/const_ptr.rs b/crates/core_simd/src/simd/ptr/const_ptr.rs
index 809ea7cf43b77..4e09e52f41962 100644
--- a/crates/core_simd/src/simd/ptr/const_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/const_ptr.rs
@@ -49,7 +49,7 @@ pub trait SimdConstPtr: Copy + Sealed {
     /// non-zero-sized memory accesses with a no-provenance pointer are UB. No-provenance pointers
     /// are little more than a usize address in disguise.
     ///
-    /// This is different from [`Self::from_exposed_addr`], which creates a pointer that picks up a
+    /// This is different from [`Self::with_exposed_provenance`], which creates a pointer that picks up a
     /// previously exposed provenance.
     ///
     /// Equivalent to calling [`core::ptr::without_provenance`] on each element.
diff --git a/crates/core_simd/src/simd/ptr/mut_ptr.rs b/crates/core_simd/src/simd/ptr/mut_ptr.rs
index f418f90154e06..9ddce68aeb32b 100644
--- a/crates/core_simd/src/simd/ptr/mut_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/mut_ptr.rs
@@ -46,7 +46,7 @@ pub trait SimdMutPtr: Copy + Sealed {
     /// non-zero-sized memory accesses with a no-provenance pointer are UB. No-provenance pointers
     /// are little more than a usize address in disguise.
     ///
-    /// This is different from [`Self::from_exposed_addr`], which creates a pointer that picks up a
+    /// This is different from [`Self::with_exposed_provenance`], which creates a pointer that picks up a
     /// previously exposed provenance.
     ///
     /// Equivalent to calling [`core::ptr::without_provenance`] on each element.

From fe815f775f444b789f90ce3aae08721982ecdfd6 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Thu, 11 Apr 2024 00:25:10 -0400
Subject: [PATCH 27/74] Simplify CI

---
 .github/workflows/ci.yml | 35 +++--------------------------------
 1 file changed, 3 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a49cfa44a74d9..d3f79733d3abe 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -17,11 +17,6 @@ jobs:
 
     steps:
       - uses: actions/checkout@v2
-      - name: Setup Rust
-        run: |
-          rustup update nightly --no-self-update
-          rustup default nightly
-          rustup component add rustfmt
       - name: Run rustfmt
         run: cargo fmt --all -- --check
 
@@ -51,11 +46,7 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Setup Rust
-        run: |
-          rustup update nightly --no-self-update
-          rustup default nightly
-          rustup target add ${{ matrix.target }}
-          rustup component add clippy
+        run: rustup target add ${{ matrix.target }}
       - name: Run Clippy
         run: cargo clippy --all-targets --target ${{ matrix.target }}
 
@@ -100,10 +91,7 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Setup Rust
-        run: |
-          rustup update nightly --no-self-update
-          rustup default nightly
-          rustup target add ${{ matrix.target }}
+        run: rustup target add ${{ matrix.target }}
 
       - name: Configure RUSTFLAGS
         shell: bash
@@ -156,10 +144,6 @@ jobs:
           - { name: simd128, RUSTFLAGS: "-C target-feature=+simd128" }
     steps:
       - uses: actions/checkout@v2
-      - name: Setup Rust
-        run: |
-          rustup update nightly --no-self-update
-          rustup default nightly
       - name: Install wasm-pack
         run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
       - name: Test (debug)
@@ -203,11 +187,7 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Setup Rust
-        run: |
-          rustup update nightly --no-self-update
-          rustup default nightly
-          rustup target add ${{ matrix.target }}
-          rustup component add rust-src
+        run: rustup target add ${{ matrix.target }}
 
       - name: Install Cross
         # Equivalent to `cargo install cross`, but downloading a prebuilt
@@ -259,10 +239,6 @@ jobs:
 
     steps:
       - uses: actions/checkout@v2
-      - name: Setup Rust
-        run: |
-          rustup update nightly --no-self-update
-          rustup default nightly
       - name: Detect AVX512
         run: echo "CPU_FEATURE=$(lscpu | grep -o avx512[a-z]* | sed s/avx/+avx/ | tr '\n' ',' )" >> $GITHUB_ENV
       - name: Check build
@@ -278,10 +254,5 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
-      - name: Setup Rust
-        run: |
-          rustup update nightly --no-self-update
-          rustup default nightly
-          rustup component add miri rust-src
       - name: Test (Miri)
         run: cargo miri test

From 4fa3e88811c088d8d3da7e99dc1602f2b48156f9 Mon Sep 17 00:00:00 2001
From: Gary Guo <gary@garyguo.net>
Date: Fri, 19 Apr 2024 16:17:02 +0100
Subject: [PATCH 28/74] Stabilise `inline_const`

---
 crates/core_simd/src/lib.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 48514e52587f0..331b66262490c 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -7,7 +7,6 @@
     convert_float_to_int,
     core_intrinsics,
     decl_macro,
-    inline_const,
     intra_doc_pointers,
     repr_simd,
     simd_ffi,

From e72b450149f084c8b15539596f1bba12ca0ae36c Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Mon, 29 Apr 2024 21:00:20 -0400
Subject: [PATCH 29/74] Make splat const fn

---
 crates/core_simd/src/lib.rs    |  1 +
 crates/core_simd/src/vector.rs | 31 ++++++++++++++++++++++++-------
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index ecb7c78b4009e..4506feb80e340 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -1,5 +1,6 @@
 #![no_std]
 #![feature(
+    const_eval_select,
     const_intrinsic_copy,
     const_refs_to_cell,
     const_maybe_uninit_as_mut_ptr,
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index cff274dc85e82..9cadc51ba29ce 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -144,14 +144,31 @@ where
     /// assert_eq!(v.as_array(), &[8, 8, 8, 8]);
     /// ```
     #[inline]
-    pub fn splat(value: T) -> Self {
-        // This is preferred over `[value; N]`, since it's explicitly a splat:
-        // https://github.com/rust-lang/rust/issues/97804
-        struct Splat;
-        impl<const N: usize> Swizzle<N> for Splat {
-            const INDEX: [usize; N] = [0; N];
+    pub const fn splat(value: T) -> Self {
+        const fn splat_const<T, const N: usize>(value: T) -> Simd<T, N>
+        where
+            T: SimdElement,
+            LaneCount<N>: SupportedLaneCount,
+        {
+            Simd::from_array([value; N])
         }
-        Splat::swizzle::<T, 1>(Simd::<T, 1>::from([value]))
+
+        fn splat_rt<T, const N: usize>(value: T) -> Simd<T, N>
+        where
+            T: SimdElement,
+            LaneCount<N>: SupportedLaneCount,
+        {
+            // This is preferred over `[value; N]`, since it's explicitly a splat:
+            // https://github.com/rust-lang/rust/issues/97804
+            struct Splat;
+            impl<const N: usize> Swizzle<N> for Splat {
+                const INDEX: [usize; N] = [0; N];
+            }
+
+            Splat::swizzle::<T, 1>(Simd::<T, 1>::from([value]))
+        }
+
+        core::intrinsics::const_eval_select((value,), splat_const, splat_rt)
     }
 
     /// Returns an array reference containing the entire SIMD vector.

From 9314128affcbbe517c8f083645c447980ad6b236 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Mon, 29 Apr 2024 21:08:24 -0400
Subject: [PATCH 30/74] Lock rust version

---
 crates/core_simd/src/lib.rs | 1 -
 rust-toolchain.toml         | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index ecb7c78b4009e..f13b9004ab304 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -7,7 +7,6 @@
     convert_float_to_int,
     core_intrinsics,
     decl_macro,
-    inline_const,
     intra_doc_pointers,
     repr_simd,
     simd_ffi,
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 5e885801cda21..8f75e55722441 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,3 +1,3 @@
 [toolchain]
-channel = "nightly-2024-04-10"
+channel = "nightly-2024-04-29"
 components = ["rustfmt", "clippy", "miri", "rust-src"]

From 4fcf18b5681530c36413f30cdeac9635c6dc3488 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Mon, 29 Apr 2024 21:45:48 -0400
Subject: [PATCH 31/74] Fix macos tests in CI

---
 .github/workflows/ci.yml | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d3f79733d3abe..ef6a9e611ac04 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -56,26 +56,19 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        target: [x86_64-pc-windows-msvc, i686-pc-windows-msvc, i586-pc-windows-msvc, x86_64-unknown-linux-gnu, x86_64-apple-darwin]
+        target: [x86_64-pc-windows-msvc, i686-pc-windows-msvc, i586-pc-windows-msvc, x86_64-unknown-linux-gnu]
         # `default` means we use the default target config for the target,
         # `native` means we run with `-Ctarget-cpu=native`, and anything else is
         # an arg to `-Ctarget-feature`
         target_feature: [default, native, +sse3, +ssse3, +sse4.1, +sse4.2, +avx, +avx2]
 
         exclude:
-          # The macos runners seem to only reliably support up to `avx`.
-          - { target: x86_64-apple-darwin, target_feature: +avx2 }
-          # These features are statically known to be present for all 64 bit
-          # macs, and thus are covered by the `default` test
-          - { target: x86_64-apple-darwin, target_feature: +sse3 }
-          - { target: x86_64-apple-darwin, target_feature: +ssse3 }
           # -Ctarget-cpu=native sounds like bad-news if target != host
           - { target: i686-pc-windows-msvc, target_feature: native }
           - { target: i586-pc-windows-msvc, target_feature: native }
 
         include:
           # Populate the `matrix.os` field
-          - { target: x86_64-apple-darwin,      os: macos-latest }
           - { target: x86_64-unknown-linux-gnu, os: ubuntu-latest }
           - { target: x86_64-pc-windows-msvc,   os: windows-latest }
           - { target: i686-pc-windows-msvc,     os: windows-latest }
@@ -133,6 +126,35 @@ jobs:
         run: cargo doc --verbose --target=${{ matrix.target }}
         env:
           RUSTDOCFLAGS: -Dwarnings
+    
+  macos-tests:
+    name: ${{ matrix.target }}
+    runs-on: macos-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        target:
+          - aarch64-apple-darwin
+          - x86_64-apple-darwin
+    steps:
+      - uses: actions/checkout@v2
+      - name: Setup Rust
+        run: rustup target add ${{ matrix.target }}
+
+      - name: Configure RUSTFLAGS
+        shell: bash
+        run: echo "RUSTFLAGS=-Dwarnings" >> $GITHUB_ENV
+
+      - name: Test (debug)
+        run: cargo test --verbose --target=${{ matrix.target }}
+
+      - name: Test (release)
+        run: cargo test --verbose --target=${{ matrix.target }} --release
+
+      - name: Generate docs
+        run: cargo doc --verbose --target=${{ matrix.target }}
+        env:
+          RUSTDOCFLAGS: -Dwarnings
 
   wasm-tests:
     name: "wasm (firefox, ${{ matrix.name }})"

From 3927f4adfa7df520703443e622a395622e68377b Mon Sep 17 00:00:00 2001
From: Josh Triplett <josh@joshtriplett.org>
Date: Thu, 28 Mar 2024 14:30:32 +0000
Subject: [PATCH 32/74] Add `size_of`, `size_of_val`, `align_of`, and
 `align_of_val` to the prelude

Many, many projects use `size_of` to get the size of a type. However,
it's also often equally easy to hardcode a size (e.g. `8` instead of
`size_of::<u64>()`). Minimizing friction in the use of `size_of` helps
ensure that people use it and make code more self-documenting.

The name `size_of` is unambiguous: the name alone, without any prefix or
path, is self-explanatory and unmistakeable for any other functionality.
Adding it to the prelude cannot produce any name conflicts, as any local
definition will silently shadow the one from the prelude. Thus, we don't
need to wait for a new edition prelude to add it.

Add `size_of_val`, `align_of`, and `align_of_val` as well, with similar
justification: widely useful, self-explanatory, unmistakeable for
anything else, won't produce conflicts.
---
 crates/core_simd/src/simd/ptr/const_ptr.rs | 2 +-
 crates/core_simd/src/simd/ptr/mut_ptr.rs   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/core_simd/src/simd/ptr/const_ptr.rs b/crates/core_simd/src/simd/ptr/const_ptr.rs
index 0f1719206c9ce..cbffbc564cfed 100644
--- a/crates/core_simd/src/simd/ptr/const_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/const_ptr.rs
@@ -96,7 +96,7 @@ where
     fn cast<U>(self) -> Self::CastPtr<U> {
         // SimdElement currently requires zero-sized metadata, so this should never fail.
         // If this ever changes, `simd_cast_ptr` should produce a post-mono error.
-        use core::{mem::size_of, ptr::Pointee};
+        use core::ptr::Pointee;
         assert_eq!(size_of::<<T as Pointee>::Metadata>(), 0);
         assert_eq!(size_of::<<U as Pointee>::Metadata>(), 0);
 
diff --git a/crates/core_simd/src/simd/ptr/mut_ptr.rs b/crates/core_simd/src/simd/ptr/mut_ptr.rs
index 7ba996d149c0c..6bc6ca3ac42dc 100644
--- a/crates/core_simd/src/simd/ptr/mut_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/mut_ptr.rs
@@ -93,7 +93,7 @@ where
     fn cast<U>(self) -> Self::CastPtr<U> {
         // SimdElement currently requires zero-sized metadata, so this should never fail.
         // If this ever changes, `simd_cast_ptr` should produce a post-mono error.
-        use core::{mem::size_of, ptr::Pointee};
+        use core::ptr::Pointee;
         assert_eq!(size_of::<<T as Pointee>::Metadata>(), 0);
         assert_eq!(size_of::<<U as Pointee>::Metadata>(), 0);
 

From 435f7068b0acd39dc48ef84793e658e8eb89e342 Mon Sep 17 00:00:00 2001
From: blyxyas <blyxyas@gmail.com>
Date: Sat, 18 May 2024 18:08:11 +0200
Subject: [PATCH 33/74] Fix typos (taking into account review comments)

---
 crates/core_simd/src/ops.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index d8e10eeaa1a2a..dd7303a97b197 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -122,7 +122,7 @@ macro_rules! for_base_types {
                     #[inline]
                     #[must_use = "operator returns a new vector without mutating the inputs"]
                     // TODO: only useful for int Div::div, but we hope that this
-                    // will essentially always always get inlined anyway.
+                    // will essentially always get inlined anyway.
                     #[track_caller]
                     fn $call(self, rhs: Self) -> Self::Output {
                         $macro_impl!(self, rhs, $inner, $scalar)

From 675401b04bd9662325eac63774e476470e272743 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Wed, 5 Jun 2024 13:51:20 -0400
Subject: [PATCH 34/74] Add extend special swizzle fn, and implement special
 swizzle fns for masks

---
 crates/core_simd/src/swizzle.rs | 180 +++++++++++++++++++++++++++++++-
 1 file changed, 179 insertions(+), 1 deletion(-)

diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index 71110bb282018..39e763494395a 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -312,7 +312,9 @@ where
     ///
     /// ```
     /// # #![feature(portable_simd)]
-    /// # use core::simd::Simd;
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::Simd;
     /// let a = Simd::from_array([0, 4, 1, 5]);
     /// let b = Simd::from_array([2, 6, 3, 7]);
     /// let (x, y) = a.deinterleave(b);
@@ -383,4 +385,180 @@ where
         }
         Resize::<N>::concat_swizzle(self, Simd::splat(value))
     }
+
+    /// Extract a vector from another vector.
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::u32x4;
+    /// let x = u32x4::from_array([0, 1, 2, 3]);
+    /// assert_eq!(x.extract::<1, 2>().to_array(), [1, 2]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn extract<const START: usize, const LEN: usize>(self) -> Simd<T, LEN>
+    where
+        LaneCount<LEN>: SupportedLaneCount,
+    {
+        struct Extract<const N: usize, const START: usize>;
+        impl<const N: usize, const START: usize, const LEN: usize> Swizzle<LEN> for Extract<N, START> {
+            const INDEX: [usize; LEN] = const {
+                assert!(START + LEN <= N, "index out of bounds");
+                let mut index = [0; LEN];
+                let mut i = 0;
+                while i < LEN {
+                    index[i] = START + i;
+                    i += 1;
+                }
+                index
+            };
+        }
+        Extract::<N, START>::swizzle(self)
+    }
+}
+
+impl<T, const N: usize> Mask<T, N>
+where
+    T: MaskElement,
+    LaneCount<N>: SupportedLaneCount,
+{
+    /// Reverse the order of the elements in the mask.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn reverse(self) -> Self {
+        // Safety: swizzles are safe for masks
+        unsafe { Self::from_int_unchecked(self.to_int().reverse()) }
+    }
+
+    /// Rotates the mask such that the first `OFFSET` elements of the mask move to the end
+    /// while the last `self.len() - OFFSET` elements move to the front. After calling `rotate_elements_left`,
+    /// the element previously at index `OFFSET` will become the first element in the mask.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn rotate_elements_left<const OFFSET: usize>(self) -> Self {
+        // Safety: swizzles are safe for masks
+        unsafe { Self::from_int_unchecked(self.to_int().rotate_elements_left::<OFFSET>()) }
+    }
+
+    /// Rotates the mask such that the first `self.len() - OFFSET` elements of the mask move to
+    /// the end while the last `OFFSET` elements move to the front. After calling `rotate_elements_right`,
+    /// the element previously at index `self.len() - OFFSET` will become the first element in the mask.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn rotate_elements_right<const OFFSET: usize>(self) -> Self {
+        // Safety: swizzles are safe for masks
+        unsafe { Self::from_int_unchecked(self.to_int().rotate_elements_right::<OFFSET>()) }
+    }
+
+    /// Interleave two masks.
+    ///
+    /// The resulting masks contain elements taken alternately from `self` and `other`, first
+    /// filling the first result, and then the second.
+    ///
+    /// The reverse of this operation is [`Mask::deinterleave`].
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::mask32x4;
+    /// let a = mask32x4::from_array([false, true, false, true]);
+    /// let b = mask32x4::from_array([false, false, true, true]);
+    /// let (x, y) = a.interleave(b);
+    /// assert_eq!(x.to_array(), [false, false, true, false]);
+    /// assert_eq!(y.to_array(), [false, true, true, true]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn interleave(self, other: Self) -> (Self, Self) {
+        // Safety: swizzles are safe for masks
+        let (lo, hi) = self.to_int().interleave(other.to_int());
+        unsafe { (Self::from_int_unchecked(lo), Self::from_int_unchecked(hi)) }
+    }
+
+    /// Deinterleave two masks.
+    ///
+    /// The first result takes every other element of `self` and then `other`, starting with
+    /// the first element.
+    ///
+    /// The second result takes every other element of `self` and then `other`, starting with
+    /// the second element.
+    ///
+    /// The reverse of this operation is [`Mask::interleave`].
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::mask32x4;
+    /// let a = mask32x4::from_array([false, true, false, true]);
+    /// let b = mask32x4::from_array([false, false, true, true]);
+    /// let (x, y) = a.deinterleave(b);
+    /// assert_eq!(x.to_array(), [false, false, false, true]);
+    /// assert_eq!(y.to_array(), [true, true, false, true]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn deinterleave(self, other: Self) -> (Self, Self) {
+        // Safety: swizzles are safe for masks
+        let (even, odd) = self.to_int().deinterleave(other.to_int());
+        unsafe {
+            (
+                Self::from_int_unchecked(even),
+                Self::from_int_unchecked(odd),
+            )
+        }
+    }
+
+    /// Resize a mask.
+    ///
+    /// If `M` > `N`, extends the length of a mask, setting the new elements to `value`.
+    /// If `M` < `N`, truncates the mask to the first `M` elements.
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::mask32x4;
+    /// let x = mask32x4::from_array([false, true, true, false]);
+    /// assert_eq!(x.resize::<8>(true).to_array(), [false, true, true, false, true, true, true, true]);
+    /// assert_eq!(x.resize::<2>(true).to_array(), [false, true]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn resize<const M: usize>(self, value: bool) -> Mask<T, M>
+    where
+        LaneCount<M>: SupportedLaneCount,
+    {
+        // Safety: swizzles are safe for masks
+        unsafe {
+            Mask::<T, M>::from_int_unchecked(self.to_int().resize::<M>(if value {
+                T::TRUE
+            } else {
+                T::FALSE
+            }))
+        }
+    }
+
+    /// Extract a mask from another mask.
+    ///
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::mask32x4;
+    /// let x = mask32x4::from_array([false, true, true, false]);
+    /// assert_eq!(x.extract::<1, 2>().to_array(), [true, true]);
+    /// ```
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn extract<const START: usize, const LEN: usize>(self) -> Mask<T, LEN>
+    where
+        LaneCount<LEN>: SupportedLaneCount,
+    {
+        // Safety: swizzles are safe for masks
+        unsafe { Mask::<T, LEN>::from_int_unchecked(self.to_int().extract::<START, LEN>()) }
+    }
 }

From 3733375f53bb968e18b56b7a6a3d1c1d51f15e4b Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Wed, 5 Jun 2024 13:56:13 -0400
Subject: [PATCH 35/74] Remove bitmask vectors in favor of extracting bitmasks

---
 crates/core_simd/src/masks.rs            | 42 ------------------
 crates/core_simd/src/masks/bitmask.rs    | 17 -------
 crates/core_simd/src/masks/full_masks.rs | 56 ------------------------
 crates/core_simd/tests/masks.rs          | 42 ------------------
 4 files changed, 157 deletions(-)

diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index e6e27c76a5e99..d8e984d9e9960 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -308,48 +308,6 @@ where
         Self(mask_impl::Mask::from_bitmask_integer(bitmask))
     }
 
-    /// Create a bitmask vector from a mask.
-    ///
-    /// Each bit is set if the corresponding element in the mask is `true`.
-    /// The remaining bits are unset.
-    ///
-    /// The bits are packed into the first N bits of the vector:
-    /// ```
-    /// # #![feature(portable_simd)]
-    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
-    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::mask32x8;
-    /// let mask = mask32x8::from_array([true, false, true, false, false, false, true, false]);
-    /// assert_eq!(mask.to_bitmask_vector()[0], 0b01000101);
-    /// ```
-    #[inline]
-    #[must_use = "method returns a new integer and does not mutate the original value"]
-    pub fn to_bitmask_vector(self) -> Simd<u8, N> {
-        self.0.to_bitmask_vector()
-    }
-
-    /// Create a mask from a bitmask vector.
-    ///
-    /// For each bit, if it is set, the corresponding element in the mask is set to `true`.
-    ///
-    /// The bits are packed into the first N bits of the vector:
-    /// ```
-    /// # #![feature(portable_simd)]
-    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
-    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
-    /// # use simd::{mask32x8, u8x8};
-    /// let bitmask = u8x8::from_array([0b01000101, 0, 0, 0, 0, 0, 0, 0]);
-    /// assert_eq!(
-    ///     mask32x8::from_bitmask_vector(bitmask),
-    ///     mask32x8::from_array([true, false, true, false, false, false, true, false]),
-    /// );
-    /// ```
-    #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn from_bitmask_vector(bitmask: Simd<u8, N>) -> Self {
-        Self(mask_impl::Mask::from_bitmask_vector(bitmask))
-    }
-
     /// Find the index of the first set element.
     ///
     /// ```
diff --git a/crates/core_simd/src/masks/bitmask.rs b/crates/core_simd/src/masks/bitmask.rs
index 96c553426ee74..db4312d5bf88a 100644
--- a/crates/core_simd/src/masks/bitmask.rs
+++ b/crates/core_simd/src/masks/bitmask.rs
@@ -122,23 +122,6 @@ where
         unsafe { Self(core::intrinsics::simd::simd_bitmask(value), PhantomData) }
     }
 
-    #[inline]
-    #[must_use = "method returns a new vector and does not mutate the original value"]
-    pub fn to_bitmask_vector(self) -> Simd<u8, N> {
-        let mut bitmask = Simd::splat(0);
-        bitmask.as_mut_array()[..self.0.as_ref().len()].copy_from_slice(self.0.as_ref());
-        bitmask
-    }
-
-    #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn from_bitmask_vector(bitmask: Simd<u8, N>) -> Self {
-        let mut bytes = <LaneCount<N> as SupportedLaneCount>::BitMask::default();
-        let len = bytes.as_ref().len();
-        bytes.as_mut().copy_from_slice(&bitmask.as_array()[..len]);
-        Self(bytes, PhantomData)
-    }
-
     #[inline]
     pub fn to_bitmask_integer(self) -> u64 {
         let mut bitmask = [0u8; 8];
diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs
index 87f031a9f367a..0c041b32761bc 100644
--- a/crates/core_simd/src/masks/full_masks.rs
+++ b/crates/core_simd/src/masks/full_masks.rs
@@ -140,62 +140,6 @@ where
         unsafe { Mask(core::intrinsics::simd::simd_cast(self.0)) }
     }
 
-    #[inline]
-    #[must_use = "method returns a new vector and does not mutate the original value"]
-    pub fn to_bitmask_vector(self) -> Simd<u8, N> {
-        let mut bitmask = Simd::splat(0);
-
-        // Safety: Bytes is the right size array
-        unsafe {
-            // Compute the bitmask
-            let mut bytes: <LaneCount<N> as SupportedLaneCount>::BitMask =
-                core::intrinsics::simd::simd_bitmask(self.0);
-
-            // LLVM assumes bit order should match endianness
-            if cfg!(target_endian = "big") {
-                for x in bytes.as_mut() {
-                    *x = x.reverse_bits()
-                }
-                if N % 8 > 0 {
-                    bytes.as_mut()[N / 8] >>= 8 - N % 8;
-                }
-            }
-
-            bitmask.as_mut_array()[..bytes.as_ref().len()].copy_from_slice(bytes.as_ref());
-        }
-
-        bitmask
-    }
-
-    #[inline]
-    #[must_use = "method returns a new mask and does not mutate the original value"]
-    pub fn from_bitmask_vector(bitmask: Simd<u8, N>) -> Self {
-        let mut bytes = <LaneCount<N> as SupportedLaneCount>::BitMask::default();
-
-        // Safety: Bytes is the right size array
-        unsafe {
-            let len = bytes.as_ref().len();
-            bytes.as_mut().copy_from_slice(&bitmask.as_array()[..len]);
-
-            // LLVM assumes bit order should match endianness
-            if cfg!(target_endian = "big") {
-                for x in bytes.as_mut() {
-                    *x = x.reverse_bits();
-                }
-                if N % 8 > 0 {
-                    bytes.as_mut()[N / 8] >>= 8 - N % 8;
-                }
-            }
-
-            // Compute the regular mask
-            Self::from_int_unchecked(core::intrinsics::simd::simd_select_bitmask(
-                bytes,
-                Self::splat(true).to_int(),
-                Self::splat(false).to_int(),
-            ))
-        }
-    }
-
     #[inline]
     unsafe fn to_bitmask_impl<U: ReverseBits, const M: usize>(self) -> U
     where
diff --git a/crates/core_simd/tests/masks.rs b/crates/core_simd/tests/masks.rs
index fc6a3476b7c60..2fe0c97407f03 100644
--- a/crates/core_simd/tests/masks.rs
+++ b/crates/core_simd/tests/masks.rs
@@ -134,48 +134,6 @@ macro_rules! test_mask_api {
                 cast_impl::<i64>();
                 cast_impl::<isize>();
             }
-
-            #[test]
-            fn roundtrip_bitmask_vector_conversion() {
-                use core_simd::simd::ToBytes;
-                let values = [
-                    true, false, false, true, false, false, true, false,
-                    true, true, false, false, false, false, false, true,
-                ];
-                let mask = Mask::<$type, 16>::from_array(values);
-                let bitmask = mask.to_bitmask_vector();
-                assert_eq!(bitmask.resize::<2>(0).to_ne_bytes()[..2], [0b01001001, 0b10000011]);
-                assert_eq!(Mask::<$type, 16>::from_bitmask_vector(bitmask), mask);
-            }
-
-            // rust-lang/portable-simd#379
-            #[test]
-            fn roundtrip_bitmask_vector_conversion_small() {
-                use core_simd::simd::ToBytes;
-                let values = [
-                    true, false, true, true
-                ];
-                let mask = Mask::<$type, 4>::from_array(values);
-                let bitmask = mask.to_bitmask_vector();
-                assert_eq!(bitmask.resize::<1>(0).to_ne_bytes()[0], 0b00001101);
-                assert_eq!(Mask::<$type, 4>::from_bitmask_vector(bitmask), mask);
-            }
-
-            /* FIXME doesn't work with non-powers-of-two, yet
-            // rust-lang/portable-simd#379
-            #[cfg(feature = "all_lane_counts")]
-            #[test]
-            fn roundtrip_bitmask_vector_conversion_odd() {
-                use core_simd::simd::ToBytes;
-                let values = [
-                    true, false, true, false, true, true, false, false, false, true, true,
-                ];
-                let mask = Mask::<$type, 11>::from_array(values);
-                let bitmask = mask.to_bitmask_vector();
-                assert_eq!(bitmask.resize::<2>(0).to_ne_bytes()[..2], [0b00110101, 0b00000110]);
-                assert_eq!(Mask::<$type, 11>::from_bitmask_vector(bitmask), mask);
-            }
-            */
         }
     }
 }

From bd92b7ccf3ee7cc6018b1b0e187ea979ed099a67 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Wed, 5 Jun 2024 14:17:40 -0400
Subject: [PATCH 36/74] Fix clippy lints

---
 crates/core_simd/src/swizzle.rs | 4 ++--
 crates/core_simd/src/vector.rs  | 7 +++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index 39e763494395a..a4b6138aa0abb 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -473,8 +473,8 @@ where
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn interleave(self, other: Self) -> (Self, Self) {
-        // Safety: swizzles are safe for masks
         let (lo, hi) = self.to_int().interleave(other.to_int());
+        // Safety: swizzles are safe for masks
         unsafe { (Self::from_int_unchecked(lo), Self::from_int_unchecked(hi)) }
     }
 
@@ -502,8 +502,8 @@ where
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
     pub fn deinterleave(self, other: Self) -> (Self, Self) {
-        // Safety: swizzles are safe for masks
         let (even, odd) = self.to_int().deinterleave(other.to_int());
+        // Safety: swizzles are safe for masks
         unsafe {
             (
                 Self::from_int_unchecked(even),
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 9cadc51ba29ce..fc029548ecac8 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -442,6 +442,9 @@ where
     ///
     /// When the element is disabled, that memory location is not accessed and the corresponding
     /// value from `or` is passed through.
+    ///
+    /// # Safety
+    /// Enabled loads must not exceed the length of `slice`.
     #[must_use]
     #[inline]
     pub unsafe fn load_select_unchecked(
@@ -459,6 +462,9 @@ where
     ///
     /// When the element is disabled, that memory location is not accessed and the corresponding
     /// value from `or` is passed through.
+    ///
+    /// # Safety
+    /// Enabled `ptr` elements must be safe to read as if by `std::ptr::read`.
     #[must_use]
     #[inline]
     pub unsafe fn load_select_ptr(
@@ -1214,6 +1220,7 @@ fn lane_indices<const N: usize>() -> Simd<usize, N>
 where
     LaneCount<N>: SupportedLaneCount,
 {
+    #![allow(clippy::needless_range_loop)]
     let mut index = [0; N];
     for i in 0..N {
         index[i] = i;

From 3c398a08201d9b927f6af09b51bbbbbb9de62e65 Mon Sep 17 00:00:00 2001
From: Daniel Paoliello <danpao@microsoft.com>
Date: Thu, 13 Jun 2024 09:58:35 -0700
Subject: [PATCH 37/74] Add arm64ec to the list of architectures to check

---
 .github/workflows/ci.yml | 1 +
 rust-toolchain.toml      | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ef6a9e611ac04..67d10e5553274 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -32,6 +32,7 @@ jobs:
           - i686-unknown-linux-gnu
           - i586-unknown-linux-gnu
           - aarch64-unknown-linux-gnu
+          - arm64ec-pc-windows-msvc
           - armv7-unknown-linux-gnueabihf
           # non-nightly since https://github.com/rust-lang/rust/pull/113274
           # - mips-unknown-linux-gnu
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 8f75e55722441..811fdb49cdba8 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,3 +1,3 @@
 [toolchain]
-channel = "nightly-2024-04-29"
+channel = "nightly-2024-06-13"
 components = ["rustfmt", "clippy", "miri", "rust-src"]

From 227a9d9e06aa6fd24cb287917bf364b54821f777 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Mon, 3 Jun 2024 17:55:07 -0400
Subject: [PATCH 38/74] Fix layout of non-power-of-two length vectors

---
 .github/workflows/ci.yml           | 28 ------------------
 crates/core_simd/Cargo.toml        |  3 +-
 crates/core_simd/src/lane_count.rs |  8 ++---
 crates/core_simd/src/vector.rs     |  2 +-
 crates/core_simd/tests/layout.rs   | 35 ++++++++++++++++++++++
 crates/core_simd/tests/masks.rs    |  1 -
 crates/test_helpers/Cargo.toml     |  3 --
 crates/test_helpers/src/lib.rs     | 47 ++++++++----------------------
 8 files changed, 52 insertions(+), 75 deletions(-)
 create mode 100644 crates/core_simd/tests/layout.rs

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 67d10e5553274..95ba1f2c8c900 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -245,34 +245,6 @@ jobs:
       - name: Test (release)
         run: cross test --verbose --target=${{ matrix.target }} --release
 
-  features:
-    name: "Test cargo features (${{ matrix.simd }} × ${{ matrix.features }})"
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        simd:
-          - ""
-          - "avx512"
-        features:
-          - ""
-          - "--features std"
-          - "--features all_lane_counts"
-          - "--all-features"
-
-    steps:
-      - uses: actions/checkout@v2
-      - name: Detect AVX512
-        run: echo "CPU_FEATURE=$(lscpu | grep -o avx512[a-z]* | sed s/avx/+avx/ | tr '\n' ',' )" >> $GITHUB_ENV
-      - name: Check build
-        if: ${{ matrix.simd == '' }}
-        run: RUSTFLAGS="-Dwarnings" cargo test --all-targets --no-default-features ${{ matrix.features }}
-      - name: Check AVX
-        if: ${{ matrix.simd == 'avx512' && contains(env.CPU_FEATURE, 'avx512') }}
-        run: |
-          echo "Found AVX features: $CPU_FEATURE"
-          RUSTFLAGS="-Dwarnings -Ctarget-feature=$CPU_FEATURE" cargo test --all-targets --no-default-features ${{ matrix.features }}
-
   miri:
     runs-on: ubuntu-latest
     steps:
diff --git a/crates/core_simd/Cargo.toml b/crates/core_simd/Cargo.toml
index b4a8fd70f4c0e..a7a6d43b11d3c 100644
--- a/crates/core_simd/Cargo.toml
+++ b/crates/core_simd/Cargo.toml
@@ -9,10 +9,9 @@ categories = ["hardware-support", "no-std"]
 license = "MIT OR Apache-2.0"
 
 [features]
-default = ["as_crate"]
+default = ["as_crate", "std"]
 as_crate = []
 std = []
-all_lane_counts = []
 
 [target.'cfg(target_arch = "wasm32")'.dev-dependencies]
 wasm-bindgen = "0.2"
diff --git a/crates/core_simd/src/lane_count.rs b/crates/core_simd/src/lane_count.rs
index 4cd7265ed671e..280b27bc9bc6f 100644
--- a/crates/core_simd/src/lane_count.rs
+++ b/crates/core_simd/src/lane_count.rs
@@ -33,10 +33,8 @@ macro_rules! supported_lane_count {
     };
 }
 
-supported_lane_count!(1, 2, 4, 8, 16, 32, 64);
-#[cfg(feature = "all_lane_counts")]
 supported_lane_count!(
-    3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
-    31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
-    56, 57, 58, 59, 60, 61, 62, 63
+    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
+    27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+    51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
 );
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index fc029548ecac8..fac11d66e1b04 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -99,7 +99,7 @@ use crate::simd::{
 // directly constructing an instance of the type (i.e. `let vector = Simd(array)`) should be
 // avoided, as it will likely become illegal on `#[repr(simd)]` structs in the future. It also
 // causes rustc to emit illegal LLVM IR in some cases.
-#[repr(simd)]
+#[repr(simd, packed)]
 pub struct Simd<T, const N: usize>([T; N])
 where
     LaneCount<N>: SupportedLaneCount,
diff --git a/crates/core_simd/tests/layout.rs b/crates/core_simd/tests/layout.rs
new file mode 100644
index 0000000000000..24114c2d261e7
--- /dev/null
+++ b/crates/core_simd/tests/layout.rs
@@ -0,0 +1,35 @@
+#![feature(portable_simd)]
+
+macro_rules! layout_tests { // Emits one test module per `name, type,` pair passed in.
+    { $($mod:ident, $ty:ty,)* } => {
+        $(
+        mod $mod {
+            test_helpers::test_lanes! { // instantiates the test for each tested lane count
+                fn no_padding<const LANES: usize>() { // vector size must equal array size
+                    assert_eq!(
+                        core::mem::size_of::<core_simd::simd::Simd::<$ty, LANES>>(),
+                        core::mem::size_of::<[$ty; LANES]>(),
+                    );
+                }
+            }
+        }
+        )*
+    }
+}
+
+layout_tests! { // columns: generated module name, element type
+    i8, i8,
+    i16, i16,
+    i32, i32,
+    i64, i64,
+    isize, isize,
+    u8, u8,
+    u16, u16,
+    u32, u32,
+    u64, u64,
+    usize, usize,
+    f32, f32,
+    f64, f64,
+    mut_ptr, *mut (),
+    const_ptr, *const (),
+}
diff --git a/crates/core_simd/tests/masks.rs b/crates/core_simd/tests/masks.rs
index 2fe0c97407f03..48786d02440b3 100644
--- a/crates/core_simd/tests/masks.rs
+++ b/crates/core_simd/tests/masks.rs
@@ -99,7 +99,6 @@ macro_rules! test_mask_api {
                 assert_eq!(Mask::<$type, 2>::from_bitmask(bitmask), mask);
             }
 
-            #[cfg(feature = "all_lane_counts")]
             #[test]
             fn roundtrip_bitmask_conversion_odd() {
                 let values = [
diff --git a/crates/test_helpers/Cargo.toml b/crates/test_helpers/Cargo.toml
index 23dae7c93381e..a5359b9abc84d 100644
--- a/crates/test_helpers/Cargo.toml
+++ b/crates/test_helpers/Cargo.toml
@@ -6,6 +6,3 @@ publish = false
 
 [dependencies]
 proptest = { version = "0.10", default-features = false, features = ["alloc"] }
-
-[features]
-all_lane_counts = []
diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs
index c55099434c8c1..3d70f5ca5fe81 100644
--- a/crates/test_helpers/src/lib.rs
+++ b/crates/test_helpers/src/lib.rs
@@ -539,32 +539,17 @@ macro_rules! test_lanes {
                     #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
                     lanes_1 1;
                     lanes_2 2;
+                    lanes_3 3; // test one non-power-of-2 length on miri
                 );
 
                 #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
                 $crate::test_lanes_helper!(
                     #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
                     lanes_4 4;
-                    lanes_8 8;
-                    lanes_16 16;
-                    lanes_32 32;
-                    lanes_64 64;
-                );
-
-                #[cfg(feature = "all_lane_counts")]
-                $crate::test_lanes_helper!(
-                    // test one non-power-of-2 length on miri
-                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
-                    lanes_3 3;
-                );
-
-                #[cfg(feature = "all_lane_counts")]
-                #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
-                $crate::test_lanes_helper!(
-                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
                     lanes_5 5;
                     lanes_6 6;
                     lanes_7 7;
+                    lanes_8 8;
                     lanes_9 9;
                     lanes_10 10;
                     lanes_11 11;
@@ -572,6 +557,7 @@ macro_rules! test_lanes {
                     lanes_13 13;
                     lanes_14 14;
                     lanes_15 15;
+                    lanes_16 16;
                     lanes_17 17;
                     lanes_18 18;
                     lanes_19 19;
@@ -587,6 +573,7 @@ macro_rules! test_lanes {
                     lanes_29 29;
                     lanes_30 30;
                     lanes_31 31;
+                    lanes_32 32;
                     lanes_33 33;
                     lanes_34 34;
                     lanes_35 35;
@@ -618,6 +605,7 @@ macro_rules! test_lanes {
                     lanes_61 61;
                     lanes_62 62;
                     lanes_63 63;
+                    lanes_64 64;
                 );
             }
         )*
@@ -639,36 +627,22 @@ macro_rules! test_lanes_panic {
                     core_simd::simd::LaneCount<$lanes>: core_simd::simd::SupportedLaneCount,
                 $body
 
+                // test some odd and even non-power-of-2 lengths on miri
                 $crate::test_lanes_helper!(
                     #[should_panic];
                     lanes_1 1;
                     lanes_2 2;
-                    lanes_4 4;
-                );
-
-                #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
-                $crate::test_lanes_helper!(
-                    #[should_panic];
-                    lanes_8 8;
-                    lanes_16 16;
-                    lanes_32 32;
-                    lanes_64 64;
-                );
-
-                #[cfg(feature = "all_lane_counts")]
-                $crate::test_lanes_helper!(
-                    // test some odd and even non-power-of-2 lengths on miri
-                    #[should_panic];
                     lanes_3 3;
+                    lanes_4 4;
                     lanes_5 5;
-                    lanes_6 6;
                 );
 
-                #[cfg(feature = "all_lane_counts")]
                 #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
                 $crate::test_lanes_helper!(
                     #[should_panic];
+                    lanes_6 6;
                     lanes_7 7;
+                    lanes_8 8;
                     lanes_9 9;
                     lanes_10 10;
                     lanes_11 11;
@@ -676,6 +650,7 @@ macro_rules! test_lanes_panic {
                     lanes_13 13;
                     lanes_14 14;
                     lanes_15 15;
+                    lanes_16 16;
                     lanes_17 17;
                     lanes_18 18;
                     lanes_19 19;
@@ -691,6 +666,7 @@ macro_rules! test_lanes_panic {
                     lanes_29 29;
                     lanes_30 30;
                     lanes_31 31;
+                    lanes_32 32;
                     lanes_33 33;
                     lanes_34 34;
                     lanes_35 35;
@@ -722,6 +698,7 @@ macro_rules! test_lanes_panic {
                     lanes_61 61;
                     lanes_62 62;
                     lanes_63 63;
+                    lanes_64 64;
                 );
             }
         )*

From 6e084428ec9493a65f410d789220262d84d22267 Mon Sep 17 00:00:00 2001
From: gstvg <28798827+gstvg@users.noreply.github.com>
Date: Wed, 19 Jun 2024 11:05:44 -0300
Subject: [PATCH 39/74] Fix Mask::all must_use attribute

---
 crates/core_simd/src/masks/full_masks.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/src/masks/full_masks.rs b/crates/core_simd/src/masks/full_masks.rs
index 0c041b32761bc..2d01946b5747c 100644
--- a/crates/core_simd/src/masks/full_masks.rs
+++ b/crates/core_simd/src/masks/full_masks.rs
@@ -227,7 +227,7 @@ where
     }
 
     #[inline]
-    #[must_use = "method returns a new vector and does not mutate the original value"]
+    #[must_use = "method returns a new bool and does not mutate the original value"]
     pub fn all(self) -> bool {
         // Safety: use `self` as an integer vector
         unsafe { core::intrinsics::simd::simd_reduce_all(self.to_int()) }

From 30a631731de667966a1766af0ec5432c791093c8 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 20 Jun 2024 05:02:12 +1000
Subject: [PATCH 40/74] Convert some module-level `//` and `///` comments to
 `//!`.

This makes their intent and expected location clearer. We see some
examples where these comments were not clearly separate from `use`
declarations, which made it hard to understand what the comment is
describing.
---
 crates/core_simd/examples/dot_product.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/crates/core_simd/examples/dot_product.rs b/crates/core_simd/examples/dot_product.rs
index f047010a65c16..75d152ae7f0e3 100644
--- a/crates/core_simd/examples/dot_product.rs
+++ b/crates/core_simd/examples/dot_product.rs
@@ -1,6 +1,5 @@
-// Code taken from the `packed_simd` crate
-// Run this code with `cargo test --example dot_product`
-//use std::iter::zip;
+//! Code taken from the `packed_simd` crate.
+//! Run this code with `cargo test --example dot_product`.
 
 #![feature(array_chunks)]
 #![feature(slice_as_chunks)]

From 2937f6f2cca1a72966e8b75650c290405dc8e578 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Thu, 20 Jun 2024 05:04:30 +1000
Subject: [PATCH 41/74] Add blank lines after module-level `//!` comments.

Most modules have such a blank line, but some don't. Inserting the blank
line makes it clearer that the `//!` comments are describing the entire
module, rather than the `use` declaration(s) that immediately follows.
---
 crates/core_simd/src/ops/assign.rs | 1 +
 crates/core_simd/src/ops/deref.rs  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/crates/core_simd/src/ops/assign.rs b/crates/core_simd/src/ops/assign.rs
index 0e87785025a38..d21d867de26d6 100644
--- a/crates/core_simd/src/ops/assign.rs
+++ b/crates/core_simd/src/ops/assign.rs
@@ -1,4 +1,5 @@
 //! Assignment operators
+
 use super::*;
 use core::ops::{AddAssign, MulAssign}; // commutative binary op-assignment
 use core::ops::{BitAndAssign, BitOrAssign, BitXorAssign}; // commutative bit binary op-assignment
diff --git a/crates/core_simd/src/ops/deref.rs b/crates/core_simd/src/ops/deref.rs
index 89a60ba114146..0ff76cfba39bb 100644
--- a/crates/core_simd/src/ops/deref.rs
+++ b/crates/core_simd/src/ops/deref.rs
@@ -2,6 +2,7 @@
 //! Ideally, Rust would take care of this itself,
 //! and method calls usually handle the LHS implicitly.
 //! But this is not the case with arithmetic ops.
+
 use super::*;
 
 macro_rules! deref_lhs {

From b17317663573d4ff7d881e3fa1d9403bbda19769 Mon Sep 17 00:00:00 2001
From: wooden-worm <93303706+wooden-worm@users.noreply.github.com>
Date: Sun, 23 Jun 2024 22:58:30 -0700
Subject: [PATCH 42/74] wasm64 build with target-feature=+simd128,+atomics

---
 crates/core_simd/src/swizzle_dyn.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs
index 8a1079042f076..3b6388d0f2759 100644
--- a/crates/core_simd/src/swizzle_dyn.rs
+++ b/crates/core_simd/src/swizzle_dyn.rs
@@ -30,6 +30,8 @@ where
         use core::arch::arm::{uint8x8_t, vtbl1_u8};
         #[cfg(target_arch = "wasm32")]
         use core::arch::wasm32 as wasm;
+        #[cfg(target_arch = "wasm64")]
+        use core::arch::wasm64 as wasm;
         #[cfg(target_arch = "x86")]
         use core::arch::x86;
         #[cfg(target_arch = "x86_64")]

From 048ba3e4b16f97707bacf2baed89e90dc9d6974e Mon Sep 17 00:00:00 2001
From: John Arundel <john@bitfieldconsulting.com>
Date: Mon, 15 Jul 2024 12:26:30 +0100
Subject: [PATCH 43/74] Fix doc nits

Many tiny changes to stdlib doc comments to make them consistent (for example
"Returns foo", rather than "Return foo", per RFC1574), adding missing periods, paragraph
breaks, backticks for monospace style, and other minor nits.

https://github.com/rust-lang/rfcs/blob/master/text/1574-more-api-documentation-conventions.md#appendix-a-full-conventions-text
---
 crates/core_simd/src/masks.rs              | 12 ++++++------
 crates/core_simd/src/simd/ptr/const_ptr.rs |  2 +-
 crates/core_simd/src/simd/ptr/mut_ptr.rs   |  2 +-
 crates/core_simd/src/swizzle.rs            | 10 +++++-----
 crates/core_simd/src/to_bytes.rs           | 14 +++++++-------
 crates/core_simd/src/vector.rs             |  6 +++---
 6 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/crates/core_simd/src/masks.rs b/crates/core_simd/src/masks.rs
index e6e27c76a5e99..04de3a968276d 100644
--- a/crates/core_simd/src/masks.rs
+++ b/crates/core_simd/src/masks.rs
@@ -137,7 +137,7 @@ where
     T: MaskElement,
     LaneCount<N>: SupportedLaneCount,
 {
-    /// Construct a mask by setting all elements to the given value.
+    /// Constructs a mask by setting all elements to the given value.
     #[inline]
     pub fn splat(value: bool) -> Self {
         Self(mask_impl::Mask::splat(value))
@@ -288,7 +288,7 @@ where
         self.0.all()
     }
 
-    /// Create a bitmask from a mask.
+    /// Creates a bitmask from a mask.
     ///
     /// Each bit is set if the corresponding element in the mask is `true`.
     /// If the mask contains more than 64 elements, the bitmask is truncated to the first 64.
@@ -298,7 +298,7 @@ where
         self.0.to_bitmask_integer()
     }
 
-    /// Create a mask from a bitmask.
+    /// Creates a mask from a bitmask.
     ///
     /// For each bit, if it is set, the corresponding element in the mask is set to `true`.
     /// If the mask contains more than 64 elements, the remainder are set to `false`.
@@ -308,7 +308,7 @@ where
         Self(mask_impl::Mask::from_bitmask_integer(bitmask))
     }
 
-    /// Create a bitmask vector from a mask.
+    /// Creates a bitmask vector from a mask.
     ///
     /// Each bit is set if the corresponding element in the mask is `true`.
     /// The remaining bits are unset.
@@ -328,7 +328,7 @@ where
         self.0.to_bitmask_vector()
     }
 
-    /// Create a mask from a bitmask vector.
+    /// Creates a mask from a bitmask vector.
     ///
     /// For each bit, if it is set, the corresponding element in the mask is set to `true`.
     ///
@@ -350,7 +350,7 @@ where
         Self(mask_impl::Mask::from_bitmask_vector(bitmask))
     }
 
-    /// Find the index of the first set element.
+    /// Finds the index of the first set element.
     ///
     /// ```
     /// # #![feature(portable_simd)]
diff --git a/crates/core_simd/src/simd/ptr/const_ptr.rs b/crates/core_simd/src/simd/ptr/const_ptr.rs
index cbffbc564cfed..be635ea640b86 100644
--- a/crates/core_simd/src/simd/ptr/const_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/const_ptr.rs
@@ -54,7 +54,7 @@ pub trait SimdConstPtr: Copy + Sealed {
     /// [`Self::with_exposed_provenance`] and returns the "address" portion.
     fn expose_provenance(self) -> Self::Usize;
 
-    /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
+    /// Converts an address back to a pointer, picking up a previously "exposed" provenance.
     ///
     /// Equivalent to calling [`core::ptr::with_exposed_provenance`] on each element.
     fn with_exposed_provenance(addr: Self::Usize) -> Self;
diff --git a/crates/core_simd/src/simd/ptr/mut_ptr.rs b/crates/core_simd/src/simd/ptr/mut_ptr.rs
index 6bc6ca3ac42dc..f6823a949e32a 100644
--- a/crates/core_simd/src/simd/ptr/mut_ptr.rs
+++ b/crates/core_simd/src/simd/ptr/mut_ptr.rs
@@ -51,7 +51,7 @@ pub trait SimdMutPtr: Copy + Sealed {
     /// [`Self::with_exposed_provenance`] and returns the "address" portion.
     fn expose_provenance(self) -> Self::Usize;
 
-    /// Convert an address back to a pointer, picking up a previously "exposed" provenance.
+    /// Converts an address back to a pointer, picking up a previously "exposed" provenance.
     ///
     /// Equivalent to calling [`core::ptr::with_exposed_provenance_mut`] on each element.
     fn with_exposed_provenance(addr: Self::Usize) -> Self;
diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index 71110bb282018..2f4f777b20e29 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -69,12 +69,12 @@ pub macro simd_swizzle {
     }
 }
 
-/// Create a vector from the elements of another vector.
+/// Creates a vector from the elements of another vector.
 pub trait Swizzle<const N: usize> {
     /// Map from the elements of the input vector to the output vector.
     const INDEX: [usize; N];
 
-    /// Create a new vector from the elements of `vector`.
+    /// Creates a new vector from the elements of `vector`.
     ///
     /// Lane `i` of the output is `vector[Self::INDEX[i]]`.
     #[inline]
@@ -109,7 +109,7 @@ pub trait Swizzle<const N: usize> {
         }
     }
 
-    /// Create a new vector from the elements of `first` and `second`.
+    /// Creates a new vector from the elements of `first` and `second`.
     ///
     /// Lane `i` of the output is `concat[Self::INDEX[i]]`, where `concat` is the concatenation of
     /// `first` and `second`.
@@ -145,7 +145,7 @@ pub trait Swizzle<const N: usize> {
         }
     }
 
-    /// Create a new mask from the elements of `mask`.
+    /// Creates a new mask from the elements of `mask`.
     ///
     /// Element `i` of the output is `concat[Self::INDEX[i]]`, where `concat` is the concatenation of
     /// `first` and `second`.
@@ -161,7 +161,7 @@ pub trait Swizzle<const N: usize> {
         unsafe { Mask::from_int_unchecked(Self::swizzle(mask.to_int())) }
     }
 
-    /// Create a new mask from the elements of `first` and `second`.
+    /// Creates a new mask from the elements of `first` and `second`.
     ///
     /// Element `i` of the output is `concat[Self::INDEX[i]]`, where `concat` is the concatenation of
     /// `first` and `second`.
diff --git a/crates/core_simd/src/to_bytes.rs b/crates/core_simd/src/to_bytes.rs
index 222526c4ab30a..4833ea9e11362 100644
--- a/crates/core_simd/src/to_bytes.rs
+++ b/crates/core_simd/src/to_bytes.rs
@@ -10,7 +10,7 @@ mod sealed {
 }
 use sealed::Sealed;
 
-/// Convert SIMD vectors to vectors of bytes
+/// Converts SIMD vectors to vectors of bytes
 pub trait ToBytes: Sealed {
     /// This type, reinterpreted as bytes.
     type Bytes: Copy
@@ -22,26 +22,26 @@ pub trait ToBytes: Sealed {
         + SimdUint<Scalar = u8>
         + 'static;
 
-    /// Return the memory representation of this integer as a byte array in native byte
+    /// Returns the memory representation of this integer as a byte array in native byte
     /// order.
     fn to_ne_bytes(self) -> Self::Bytes;
 
-    /// Return the memory representation of this integer as a byte array in big-endian
+    /// Returns the memory representation of this integer as a byte array in big-endian
     /// (network) byte order.
     fn to_be_bytes(self) -> Self::Bytes;
 
-    /// Return the memory representation of this integer as a byte array in little-endian
+    /// Returns the memory representation of this integer as a byte array in little-endian
     /// byte order.
     fn to_le_bytes(self) -> Self::Bytes;
 
-    /// Create a native endian integer value from its memory representation as a byte array
+    /// Creates a native endian integer value from its memory representation as a byte array
     /// in native endianness.
     fn from_ne_bytes(bytes: Self::Bytes) -> Self;
 
-    /// Create an integer value from its representation as a byte array in big endian.
+    /// Creates an integer value from its representation as a byte array in big endian.
     fn from_be_bytes(bytes: Self::Bytes) -> Self;
 
-    /// Create an integer value from its representation as a byte array in little endian.
+    /// Creates an integer value from its representation as a byte array in little endian.
     fn from_le_bytes(bytes: Self::Bytes) -> Self;
 }
 
diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index 8dbdfc0e1fe03..3e23916914963 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -187,7 +187,7 @@ where
         unsafe { &mut *(self as *mut Self as *mut [T; N]) }
     }
 
-    /// Load a vector from an array of `T`.
+    /// Loads a vector from an array of `T`.
     ///
     /// This function is necessary since `repr(simd)` has padding for non-power-of-2 vectors (at the time of writing).
     /// With padding, `read_unaligned` will read past the end of an array of N elements.
@@ -567,7 +567,7 @@ where
         unsafe { Self::gather_select_ptr(ptrs, enable, or) }
     }
 
-    /// Read elementwise from pointers into a SIMD vector.
+    /// Reads elementwise from pointers into a SIMD vector.
     ///
     /// # Safety
     ///
@@ -808,7 +808,7 @@ where
         }
     }
 
-    /// Write pointers elementwise into a SIMD vector.
+    /// Writes pointers elementwise into a SIMD vector.
     ///
     /// # Safety
     ///

From f33640664ff93f250f8a520b6252ff66625eceec Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Thu, 6 Jun 2024 21:09:37 -0400
Subject: [PATCH 44/74] Add aarch64 workarounds

---
 crates/core_simd/src/ops.rs            | 43 ++++++++++++++++++++++++--
 crates/core_simd/src/simd/num/float.rs | 28 +++++++++++++++++
 2 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index d8e10eeaa1a2a..dffcc1cad778d 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -96,8 +96,47 @@ macro_rules! int_divrem_guard {
                 // Nice base case to make it easy to const-fold away the other branch.
                 $rhs
             };
-            // Safety: $lhs and rhs are vectors
-            unsafe { core::intrinsics::simd::$simd_call($lhs, rhs) }
+
+            // aarch64 fails for arbitrary `v % 0` for non-powers-of-two
+            #[cfg(target_arch = "aarch64")]
+            {
+                const { assert!(Self::LEN <= 64) };
+                if Self::LEN == 1 {
+                    // Safety: $lhs and rhs are vectors
+                    let x: Simd::<_, 1> = unsafe { core::intrinsics::simd::$simd_call($lhs.resize::<1>(Default::default()), rhs.resize::<1>(Default::default())) };
+                    x.resize(Default::default())
+                } else if Self::LEN <= 2 {
+                    // Safety: $lhs and rhs are vectors
+                    let x: Simd::<_, 2> = unsafe { core::intrinsics::simd::$simd_call($lhs.resize::<2>(Default::default()), rhs.resize::<2>(Default::default())) };
+                    x.resize(Default::default())
+                } else if Self::LEN <= 4 {
+                    // Safety: $lhs and rhs are vectors
+                    let x: Simd::<_, 4> = unsafe { core::intrinsics::simd::$simd_call($lhs.resize::<4>(Default::default()), rhs.resize::<4>(Default::default())) };
+                    x.resize(Default::default())
+                } else if Self::LEN <= 8 {
+                    // Safety: $lhs and rhs are vectors
+                    let x: Simd::<_, 8> = unsafe { core::intrinsics::simd::$simd_call($lhs.resize::<8>(Default::default()), rhs.resize::<8>(Default::default())) };
+                    x.resize(Default::default())
+                } else if Self::LEN <= 16 {
+                    // Safety: $lhs and rhs are vectors
+                    let x: Simd::<_, 16> = unsafe { core::intrinsics::simd::$simd_call($lhs.resize::<16>(Default::default()), rhs.resize::<16>(Default::default())) };
+                    x.resize(Default::default())
+                } else if Self::LEN <= 32 {
+                    // Safety: $lhs and rhs are vectors
+                    let x: Simd::<_, 32> = unsafe { core::intrinsics::simd::$simd_call($lhs.resize::<32>(Default::default()), rhs.resize::<32>(Default::default())) };
+                    x.resize(Default::default())
+                } else {
+                    // Safety: $lhs and rhs are vectors
+                    let x: Simd::<_, 64> = unsafe { core::intrinsics::simd::$simd_call($lhs.resize::<64>(Default::default()), rhs.resize::<64>(Default::default())) };
+                    x.resize(Default::default())
+                }
+            }
+
+            #[cfg(not(target_arch = "aarch64"))]
+            {
+                // Safety: $lhs and rhs are vectors
+                unsafe { core::intrinsics::simd::$simd_call($lhs, rhs) }
+            }
         }
     };
 }
diff --git a/crates/core_simd/src/simd/num/float.rs b/crates/core_simd/src/simd/num/float.rs
index 59e43851ea8da..48bfca32d53e0 100644
--- a/crates/core_simd/src/simd/num/float.rs
+++ b/crates/core_simd/src/simd/num/float.rs
@@ -255,6 +255,7 @@ macro_rules! impl_trait {
             type Bits = Simd<$bits_ty, N>;
             type Cast<T: SimdElement> = Simd<T, N>;
 
+            #[cfg(not(target_arch = "aarch64"))]
             #[inline]
             fn cast<T: SimdCast>(self) -> Self::Cast<T>
             {
@@ -262,6 +263,33 @@ macro_rules! impl_trait {
                 unsafe { core::intrinsics::simd::simd_as(self) }
             }
 
+            // https://github.com/llvm/llvm-project/issues/94694
+            #[cfg(target_arch = "aarch64")]
+            #[inline]
+            fn cast<T: SimdCast>(self) -> Self::Cast<T>
+            {
+                const { assert!(N <= 64) };
+                if N <= 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64 {
+                    // Safety: supported types are guaranteed by SimdCast
+                    unsafe { core::intrinsics::simd::simd_as(self) }
+                } else if N < 4 {
+                    let x = self.resize::<4>(Default::default()).cast();
+                    x.resize::<N>(x[0])
+                } else if N < 8 {
+                    let x = self.resize::<8>(Default::default()).cast();
+                    x.resize::<N>(x[0])
+                } else if N < 16 {
+                    let x = self.resize::<16>(Default::default()).cast();
+                    x.resize::<N>(x[0])
+                } else if N < 32 {
+                    let x = self.resize::<32>(Default::default()).cast();
+                    x.resize::<N>(x[0])
+                } else {
+                    let x = self.resize::<64>(Default::default()).cast();
+                    x.resize::<N>(x[0])
+                }
+            }
+
             #[inline]
             #[cfg_attr(miri, track_caller)] // even without panics, this helps for Miri backtraces
             unsafe fn to_int_unchecked<I: SimdCast>(self) -> Self::Cast<I>

From 9f7fec87d7aafb046f78c9ac2e15fa7ca6ab95b8 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Wed, 7 Aug 2024 00:18:25 -0400
Subject: [PATCH 45/74] Perform aarch64 div/rem as scalar op

---
 crates/core_simd/src/ops.rs | 42 ++++++++-----------------------------
 1 file changed, 9 insertions(+), 33 deletions(-)

diff --git a/crates/core_simd/src/ops.rs b/crates/core_simd/src/ops.rs
index dffcc1cad778d..2aad690d83dbe 100644
--- a/crates/core_simd/src/ops.rs
+++ b/crates/core_simd/src/ops.rs
@@ -77,7 +77,7 @@ macro_rules! int_divrem_guard {
     (   $lhs:ident,
         $rhs:ident,
         {   const PANIC_ZERO: &'static str = $zero:literal;
-            $simd_call:ident
+            $simd_call:ident, $op:tt
         },
         $int:ident ) => {
         if $rhs.simd_eq(Simd::splat(0 as _)).any() {
@@ -97,39 +97,15 @@ macro_rules! int_divrem_guard {
                 $rhs
             };
 
-            // aarch64 fails for arbitrary `v % 0` for non-powers-of-two
+            // aarch64 div fails for arbitrary `v % 0`, mod fails when rhs is MIN, for non-powers-of-two
+            // these operations aren't vectorized on aarch64 anyway
             #[cfg(target_arch = "aarch64")]
             {
-                const { assert!(Self::LEN <= 64) };
-                if Self::LEN == 1 {
-                    // Safety: $lhs and rhs are vectors
-                    let x: Simd::<_, 1> = unsafe { core::intrinsics::simd::$simd_call($lhs.resize::<1>(Default::default()), rhs.resize::<1>(Default::default())) };
-                    x.resize(Default::default())
-                } else if Self::LEN <= 2 {
-                    // Safety: $lhs and rhs are vectors
-                    let x: Simd::<_, 2> = unsafe { core::intrinsics::simd::$simd_call($lhs.resize::<2>(Default::default()), rhs.resize::<2>(Default::default())) };
-                    x.resize(Default::default())
-                } else if Self::LEN <= 4 {
-                    // Safety: $lhs and rhs are vectors
-                    let x: Simd::<_, 4> = unsafe { core::intrinsics::simd::$simd_call($lhs.resize::<4>(Default::default()), rhs.resize::<4>(Default::default())) };
-                    x.resize(Default::default())
-                } else if Self::LEN <= 8 {
-                    // Safety: $lhs and rhs are vectors
-                    let x: Simd::<_, 8> = unsafe { core::intrinsics::simd::$simd_call($lhs.resize::<8>(Default::default()), rhs.resize::<8>(Default::default())) };
-                    x.resize(Default::default())
-                } else if Self::LEN <= 16 {
-                    // Safety: $lhs and rhs are vectors
-                    let x: Simd::<_, 16> = unsafe { core::intrinsics::simd::$simd_call($lhs.resize::<16>(Default::default()), rhs.resize::<16>(Default::default())) };
-                    x.resize(Default::default())
-                } else if Self::LEN <= 32 {
-                    // Safety: $lhs and rhs are vectors
-                    let x: Simd::<_, 32> = unsafe { core::intrinsics::simd::$simd_call($lhs.resize::<32>(Default::default()), rhs.resize::<32>(Default::default())) };
-                    x.resize(Default::default())
-                } else {
-                    // Safety: $lhs and rhs are vectors
-                    let x: Simd::<_, 64> = unsafe { core::intrinsics::simd::$simd_call($lhs.resize::<64>(Default::default()), rhs.resize::<64>(Default::default())) };
-                    x.resize(Default::default())
+                let mut out = Simd::splat(0 as _);
+                for i in 0..Self::LEN {
+                    out[i] = $lhs[i] $op rhs[i];
                 }
+                out
             }
 
             #[cfg(not(target_arch = "aarch64"))]
@@ -244,14 +220,14 @@ for_base_ops! {
     impl Div::div {
         int_divrem_guard {
             const PANIC_ZERO: &'static str = "attempt to divide by zero";
-            simd_div
+            simd_div, /
         }
     }
 
     impl Rem::rem {
         int_divrem_guard {
             const PANIC_ZERO: &'static str = "attempt to calculate the remainder with a divisor of zero";
-            simd_rem
+            simd_rem, %
         }
     }
 

From a49f77eb38d252e946d60dde3c4801ec8a2e1ac4 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Wed, 7 Aug 2024 01:24:30 -0400
Subject: [PATCH 46/74] Swap lanes tested on miri

---
 crates/test_helpers/src/lib.rs | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs
index 3d70f5ca5fe81..a6559de0e765b 100644
--- a/crates/test_helpers/src/lib.rs
+++ b/crates/test_helpers/src/lib.rs
@@ -539,7 +539,9 @@ macro_rules! test_lanes {
                     #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
                     lanes_1 1;
                     lanes_2 2;
-                    lanes_3 3; // test one non-power-of-2 length on miri
+                    lanes_3 3;
+
+                    lanes_6 6;
                 );
 
                 #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
@@ -547,7 +549,7 @@ macro_rules! test_lanes {
                     #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
                     lanes_4 4;
                     lanes_5 5;
-                    lanes_6 6;
+
                     lanes_7 7;
                     lanes_8 8;
                     lanes_9 9;
@@ -633,14 +635,16 @@ macro_rules! test_lanes_panic {
                     lanes_1 1;
                     lanes_2 2;
                     lanes_3 3;
-                    lanes_4 4;
-                    lanes_5 5;
+
+                    lanes_6 6;
                 );
 
                 #[cfg(not(miri))] // Miri intrinsic implementations are uniform and larger tests are sloooow
                 $crate::test_lanes_helper!(
                     #[should_panic];
-                    lanes_6 6;
+                    lanes_4 4;
+                    lanes_5 5;
+
                     lanes_7 7;
                     lanes_8 8;
                     lanes_9 9;

From 751c3b5978b8cbc1ce0c789dd68824f3312f7447 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Wed, 7 Aug 2024 21:25:02 -0400
Subject: [PATCH 47/74] Update crates/test_helpers/src/lib.rs

Co-authored-by: Ralf Jung <post@ralfj.de>
---
 crates/test_helpers/src/lib.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs
index a6559de0e765b..24efa256c30b8 100644
--- a/crates/test_helpers/src/lib.rs
+++ b/crates/test_helpers/src/lib.rs
@@ -539,6 +539,9 @@ macro_rules! test_lanes {
                     #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)];
                     lanes_1 1;
                     lanes_2 2;
+                    // Cover an odd and an even non-power-of-2 length in Miri.
+                    // (Even non-power-of-2 vectors have alignment between element
+                    // and vector size, so we want to cover that case as well.)
                     lanes_3 3;
 
                     lanes_6 6;

From 7f6a981b26ca13f1295fa3fe10b21651a19431b2 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Wed, 7 Aug 2024 23:17:09 -0400
Subject: [PATCH 48/74] Disable testing most lanes to improve CI times

---
 crates/test_helpers/src/lib.rs | 160 ++++++++++++++++-----------------
 1 file changed, 80 insertions(+), 80 deletions(-)

diff --git a/crates/test_helpers/src/lib.rs b/crates/test_helpers/src/lib.rs
index 24efa256c30b8..197c920e11eac 100644
--- a/crates/test_helpers/src/lib.rs
+++ b/crates/test_helpers/src/lib.rs
@@ -564,51 +564,51 @@ macro_rules! test_lanes {
                     lanes_15 15;
                     lanes_16 16;
                     lanes_17 17;
-                    lanes_18 18;
-                    lanes_19 19;
-                    lanes_20 20;
-                    lanes_21 21;
-                    lanes_22 22;
-                    lanes_23 23;
+                    //lanes_18 18;
+                    //lanes_19 19;
+                    //lanes_20 20;
+                    //lanes_21 21;
+                    //lanes_22 22;
+                    //lanes_23 23;
                     lanes_24 24;
-                    lanes_25 25;
-                    lanes_26 26;
-                    lanes_27 27;
-                    lanes_28 28;
-                    lanes_29 29;
-                    lanes_30 30;
-                    lanes_31 31;
+                    //lanes_25 25;
+                    //lanes_26 26;
+                    //lanes_27 27;
+                    //lanes_28 28;
+                    //lanes_29 29;
+                    //lanes_30 30;
+                    //lanes_31 31;
                     lanes_32 32;
-                    lanes_33 33;
-                    lanes_34 34;
-                    lanes_35 35;
-                    lanes_36 36;
-                    lanes_37 37;
-                    lanes_38 38;
-                    lanes_39 39;
-                    lanes_40 40;
-                    lanes_41 41;
-                    lanes_42 42;
-                    lanes_43 43;
-                    lanes_44 44;
-                    lanes_45 45;
-                    lanes_46 46;
+                    //lanes_33 33;
+                    //lanes_34 34;
+                    //lanes_35 35;
+                    //lanes_36 36;
+                    //lanes_37 37;
+                    //lanes_38 38;
+                    //lanes_39 39;
+                    //lanes_40 40;
+                    //lanes_41 41;
+                    //lanes_42 42;
+                    //lanes_43 43;
+                    //lanes_44 44;
+                    //lanes_45 45;
+                    //lanes_46 46;
                     lanes_47 47;
-                    lanes_48 48;
-                    lanes_49 49;
-                    lanes_50 50;
-                    lanes_51 51;
-                    lanes_52 52;
-                    lanes_53 53;
-                    lanes_54 54;
-                    lanes_55 55;
+                    //lanes_48 48;
+                    //lanes_49 49;
+                    //lanes_50 50;
+                    //lanes_51 51;
+                    //lanes_52 52;
+                    //lanes_53 53;
+                    //lanes_54 54;
+                    //lanes_55 55;
                     lanes_56 56;
                     lanes_57 57;
-                    lanes_58 58;
-                    lanes_59 59;
-                    lanes_60 60;
-                    lanes_61 61;
-                    lanes_62 62;
+                    //lanes_58 58;
+                    //lanes_59 59;
+                    //lanes_60 60;
+                    //lanes_61 61;
+                    //lanes_62 62;
                     lanes_63 63;
                     lanes_64 64;
                 );
@@ -659,51 +659,51 @@ macro_rules! test_lanes_panic {
                     lanes_15 15;
                     lanes_16 16;
                     lanes_17 17;
-                    lanes_18 18;
-                    lanes_19 19;
-                    lanes_20 20;
-                    lanes_21 21;
-                    lanes_22 22;
-                    lanes_23 23;
+                    //lanes_18 18;
+                    //lanes_19 19;
+                    //lanes_20 20;
+                    //lanes_21 21;
+                    //lanes_22 22;
+                    //lanes_23 23;
                     lanes_24 24;
-                    lanes_25 25;
-                    lanes_26 26;
-                    lanes_27 27;
-                    lanes_28 28;
-                    lanes_29 29;
-                    lanes_30 30;
-                    lanes_31 31;
+                    //lanes_25 25;
+                    //lanes_26 26;
+                    //lanes_27 27;
+                    //lanes_28 28;
+                    //lanes_29 29;
+                    //lanes_30 30;
+                    //lanes_31 31;
                     lanes_32 32;
-                    lanes_33 33;
-                    lanes_34 34;
-                    lanes_35 35;
-                    lanes_36 36;
-                    lanes_37 37;
-                    lanes_38 38;
-                    lanes_39 39;
-                    lanes_40 40;
-                    lanes_41 41;
-                    lanes_42 42;
-                    lanes_43 43;
-                    lanes_44 44;
-                    lanes_45 45;
-                    lanes_46 46;
+                    //lanes_33 33;
+                    //lanes_34 34;
+                    //lanes_35 35;
+                    //lanes_36 36;
+                    //lanes_37 37;
+                    //lanes_38 38;
+                    //lanes_39 39;
+                    //lanes_40 40;
+                    //lanes_41 41;
+                    //lanes_42 42;
+                    //lanes_43 43;
+                    //lanes_44 44;
+                    //lanes_45 45;
+                    //lanes_46 46;
                     lanes_47 47;
-                    lanes_48 48;
-                    lanes_49 49;
-                    lanes_50 50;
-                    lanes_51 51;
-                    lanes_52 52;
-                    lanes_53 53;
-                    lanes_54 54;
-                    lanes_55 55;
+                    //lanes_48 48;
+                    //lanes_49 49;
+                    //lanes_50 50;
+                    //lanes_51 51;
+                    //lanes_52 52;
+                    //lanes_53 53;
+                    //lanes_54 54;
+                    //lanes_55 55;
                     lanes_56 56;
                     lanes_57 57;
-                    lanes_58 58;
-                    lanes_59 59;
-                    lanes_60 60;
-                    lanes_61 61;
-                    lanes_62 62;
+                    //lanes_58 58;
+                    //lanes_59 59;
+                    //lanes_60 60;
+                    //lanes_61 61;
+                    //lanes_62 62;
                     lanes_63 63;
                     lanes_64 64;
                 );

From 2a3b8ad2238c493fd73d9577f8a5bf84b0a41618 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Thu, 8 Aug 2024 00:57:49 -0400
Subject: [PATCH 49/74] Reduce proptest iterations

---
 .github/workflows/ci.yml | 5 +++++
 Cross.toml               | 2 ++
 2 files changed, 7 insertions(+)
 create mode 100644 Cross.toml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 95ba1f2c8c900..8b5213376d88d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -9,6 +9,7 @@ on:
 env:
   CARGO_NET_RETRY: 10
   RUSTUP_MAX_RETRIES: 10
+  PROPTEST_CASES: 64
 
 jobs:
   rustfmt:
@@ -181,6 +182,8 @@ jobs:
   cross-tests:
     name: "${{ matrix.target_feature }} on ${{ matrix.target }} (via cross)"
     runs-on: ubuntu-latest
+    env:
+      PROPTEST_CASES: 16
     strategy:
       fail-fast: false
 
@@ -247,6 +250,8 @@ jobs:
 
   miri:
     runs-on: ubuntu-latest
+    env:
+      PROPTEST_CASES: 16
     steps:
       - uses: actions/checkout@v2
       - name: Test (Miri)
diff --git a/Cross.toml b/Cross.toml
new file mode 100644
index 0000000000000..d21e76b92dd1a
--- /dev/null
+++ b/Cross.toml
@@ -0,0 +1,2 @@
+[build.env]
+passthrough = ["PROPTEST_CASES"]

From d7d060a0bfb94fead7b3a59fc67fc154f878a21b Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sat, 10 Aug 2024 00:47:08 -0400
Subject: [PATCH 50/74] Build test dependencies with optimization

---
 Cargo.toml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Cargo.toml b/Cargo.toml
index d1732aaec2f92..21d4584a9f4d9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,3 +5,9 @@ members = [
     "crates/std_float",
     "crates/test_helpers",
 ]
+
+[profile.test.package."*"]
+opt-level = 2
+
+[profile.test.package.test_helpers]
+opt-level = 2

From c992db6506c3d384b020bc19ff7a24849e96521e Mon Sep 17 00:00:00 2001
From: okaneco <47607823+okaneco@users.noreply.github.com>
Date: Sat, 24 Aug 2024 23:03:27 -0400
Subject: [PATCH 51/74] Add `abs_diff` function to `SimdInt` and `SimdUint`
 traits

Implement `abs_diff` for signed and unsigned integer vectors
---
 crates/core_simd/src/simd/num/int.rs  | 27 +++++++++++++++++++++++++--
 crates/core_simd/src/simd/num/uint.rs | 25 ++++++++++++++++++++++++-
 crates/core_simd/tests/ops_macros.rs  | 16 ++++++++++++++++
 3 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/crates/core_simd/src/simd/num/int.rs b/crates/core_simd/src/simd/num/int.rs
index d7598d9ceaf92..5561fe20e61f0 100644
--- a/crates/core_simd/src/simd/num/int.rs
+++ b/crates/core_simd/src/simd/num/int.rs
@@ -1,6 +1,6 @@
 use super::sealed::Sealed;
 use crate::simd::{
-    cmp::SimdPartialOrd, num::SimdUint, LaneCount, Mask, Simd, SimdCast, SimdElement,
+    cmp::SimdOrd, cmp::SimdPartialOrd, num::SimdUint, LaneCount, Mask, Simd, SimdCast, SimdElement,
     SupportedLaneCount,
 };
 
@@ -70,11 +70,27 @@ pub trait SimdInt: Copy + Sealed {
     /// # #[cfg(not(feature = "as_crate"))] use core::simd;
     /// # use simd::prelude::*;
     /// use core::i32::{MIN, MAX};
-    /// let xs = Simd::from_array([MIN, MIN +1, -5, 0]);
+    /// let xs = Simd::from_array([MIN, MIN + 1, -5, 0]);
     /// assert_eq!(xs.abs(), Simd::from_array([MIN, MAX, 5, 0]));
     /// ```
     fn abs(self) -> Self;
 
+    /// Lanewise absolute difference.
+    /// Every element becomes the absolute difference of `self` and `second`.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::prelude::*;
+    /// use core::i32::{MIN, MAX};
+    /// let a = Simd::from_array([MIN, MAX, 100, -100]);
+    /// let b = Simd::from_array([MAX, MIN, -80, -120]);
+    /// assert_eq!(a.abs_diff(b), Simd::from_array([u32::MAX, u32::MAX, 180, 20]));
+    /// ```
+    fn abs_diff(self, second: Self) -> Self::Unsigned;
+
     /// Lanewise saturating absolute value, implemented in Rust.
     /// As abs(), except the MIN value becomes MAX instead of itself.
     ///
@@ -259,6 +275,13 @@ macro_rules! impl_trait {
                 (self^m) - m
             }
 
+            #[inline]
+            fn abs_diff(self, second: Self) -> Self::Unsigned {
+                let max = self.simd_max(second);
+                let min = self.simd_min(second);
+                (max - min).cast()
+            }
+
             #[inline]
             fn saturating_abs(self) -> Self {
                 // arith shift for -1 or 0 mask based on sign bit, giving 2s complement
diff --git a/crates/core_simd/src/simd/num/uint.rs b/crates/core_simd/src/simd/num/uint.rs
index 53dd97f501c63..7cc1b5bf8b613 100644
--- a/crates/core_simd/src/simd/num/uint.rs
+++ b/crates/core_simd/src/simd/num/uint.rs
@@ -1,5 +1,5 @@
 use super::sealed::Sealed;
-use crate::simd::{LaneCount, Simd, SimdCast, SimdElement, SupportedLaneCount};
+use crate::simd::{cmp::SimdOrd, LaneCount, Simd, SimdCast, SimdElement, SupportedLaneCount};
 
 /// Operations on SIMD vectors of unsigned integers.
 pub trait SimdUint: Copy + Sealed {
@@ -57,6 +57,22 @@ pub trait SimdUint: Copy + Sealed {
     /// assert_eq!(sat, Simd::splat(0));
     fn saturating_sub(self, second: Self) -> Self;
 
+    /// Lanewise absolute difference.
+    /// Every element becomes the absolute difference of `self` and `second`.
+    ///
+    /// # Examples
+    /// ```
+    /// # #![feature(portable_simd)]
+    /// # #[cfg(feature = "as_crate")] use core_simd::simd;
+    /// # #[cfg(not(feature = "as_crate"))] use core::simd;
+    /// # use simd::prelude::*;
+    /// use core::u32::MAX;
+    /// let a = Simd::from_array([0, MAX, 100, 20]);
+    /// let b = Simd::from_array([MAX, 0, 80, 200]);
+    /// assert_eq!(a.abs_diff(b), Simd::from_array([MAX, MAX, 20, 180]));
+    /// ```
+    fn abs_diff(self, second: Self) -> Self;
+
     /// Returns the sum of the elements of the vector, with wrapping addition.
     fn reduce_sum(self) -> Self::Scalar;
 
@@ -138,6 +154,13 @@ macro_rules! impl_trait {
                 unsafe { core::intrinsics::simd::simd_saturating_sub(self, second) }
             }
 
+            #[inline]
+            fn abs_diff(self, second: Self) -> Self {
+                let max = self.simd_max(second);
+                let min = self.simd_min(second);
+                max - min
+            }
+
             #[inline]
             fn reduce_sum(self) -> Self::Scalar {
                 // Safety: `self` is an integer vector
diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs
index aa565a137527e..6503cc00053c5 100644
--- a/crates/core_simd/tests/ops_macros.rs
+++ b/crates/core_simd/tests/ops_macros.rs
@@ -307,6 +307,14 @@ macro_rules! impl_signed_tests {
                     assert_eq!(a % b, Vector::<LANES>::splat(0));
                 }
 
+                fn abs_diff<const LANES: usize>() {
+                    test_helpers::test_binary_elementwise(
+                        &Vector::<LANES>::abs_diff,
+                        &Scalar::abs_diff,
+                        &|_, _| true,
+                    )
+                }
+
                 fn simd_min<const LANES: usize>() {
                     use core_simd::simd::cmp::SimdOrd;
                     let a = Vector::<LANES>::splat(Scalar::MIN);
@@ -419,6 +427,14 @@ macro_rules! impl_unsigned_tests {
                         &|_| true,
                     );
                 }
+
+                fn abs_diff<const LANES: usize>() {
+                    test_helpers::test_binary_elementwise(
+                        &Vector::<LANES>::abs_diff,
+                        &Scalar::abs_diff,
+                        &|_, _| true,
+                    )
+                }
             }
 
             impl_binary_op_test!(Scalar, Add::add, AddAssign::add_assign, Scalar::wrapping_add);

From d5abbfa9786552ca516574bd4aa44a39665919ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cristi=20V=C3=AEjdea?= <cvijdea@bitdefender.com>
Date: Sun, 25 Aug 2024 23:11:21 +0300
Subject: [PATCH 52/74] Fix avx512vbmi swizzle_dyn implementation

---
 crates/core_simd/src/swizzle_dyn.rs | 30 +++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs
index 8a1079042f076..eaf297ba3e35d 100644
--- a/crates/core_simd/src/swizzle_dyn.rs
+++ b/crates/core_simd/src/swizzle_dyn.rs
@@ -60,12 +60,30 @@ where
                 #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                 32 => transize(avx2_pshufb, self, idxs),
                 #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
-                32 => transize(x86::_mm256_permutexvar_epi8, zeroing_idxs(idxs), self),
-                // Notable absence: avx512bw shuffle
-                // If avx512bw is available, odds of avx512vbmi are good
-                // FIXME: initial AVX512VBMI variant didn't actually pass muster
-                // #[cfg(target_feature = "avx512vbmi")]
-                // 64 => transize(x86::_mm512_permutexvar_epi8, self, idxs),
+                32 => {
+                    // Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
+                    let swizzler = |bytes, idxs| {
+                        let mask = x86::_mm256_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
+                            idxs,
+                            Simd::<u8, 32>::splat(N as u8).into(),
+                        );
+                        x86::_mm256_maskz_permutexvar_epi8(mask, idxs, bytes)
+                    };
+                    transize(swizzler, self, idxs)
+                }
+                // Notable absence: avx512bw pshufb shuffle
+                #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
+                64 => {
+                    // Unlike vpshufb, vpermb doesn't zero out values in the result based on the index high bit
+                    let swizzler = |bytes, idxs| {
+                        let mask = x86::_mm512_cmp_epu8_mask::<{ x86::_MM_CMPINT_LT }>(
+                            idxs,
+                            Simd::<u8, 64>::splat(N as u8).into(),
+                        );
+                        x86::_mm512_maskz_permutexvar_epi8(mask, idxs, bytes)
+                    };
+                    transize(swizzler, self, idxs)
+                }
                 _ => {
                     let mut array = [0; N];
                     for (i, k) in idxs.to_array().into_iter().enumerate() {

From bbcfdb5cbf201f7122c1665881bb28d78d2ee13b Mon Sep 17 00:00:00 2001
From: gstvg <28798827+gstvg@users.noreply.github.com>
Date: Thu, 29 Aug 2024 01:53:13 -0300
Subject: [PATCH 53/74] fix: swizzle_mask docs

---
 crates/core_simd/src/swizzle.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index a4b6138aa0abb..e0edb2cf10a6d 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -147,8 +147,7 @@ pub trait Swizzle<const N: usize> {
 
     /// Create a new mask from the elements of `mask`.
     ///
-    /// Element `i` of the output is `concat[Self::INDEX[i]]`, where `concat` is the concatenation of
-    /// `first` and `second`.
+    /// Element `i` of the output is `mask[Self::INDEX[i]]`.
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original inputs"]
     fn swizzle_mask<T, const M: usize>(mask: Mask<T, M>) -> Mask<T, N>

From c535320c7ec63858bb823ac889a9ae77d646ab2c Mon Sep 17 00:00:00 2001
From: Laiho <emillaiho@hotmail.fi>
Date: Sat, 31 Aug 2024 00:28:39 +0300
Subject: [PATCH 54/74] fix typo in cmp_ne docs

---
 crates/core_simd/src/simd/cmp/eq.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/src/simd/cmp/eq.rs b/crates/core_simd/src/simd/cmp/eq.rs
index 5b4615ce51d79..93989ce91b89d 100644
--- a/crates/core_simd/src/simd/cmp/eq.rs
+++ b/crates/core_simd/src/simd/cmp/eq.rs
@@ -12,7 +12,7 @@ pub trait SimdPartialEq {
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn simd_eq(self, other: Self) -> Self::Mask;
 
-    /// Test if each element is equal to the corresponding element in `other`.
+    /// Test if each element is not equal to the corresponding element in `other`.
     #[must_use = "method returns a new mask and does not mutate the original value"]
     fn simd_ne(self, other: Self) -> Self::Mask;
 }

From b6222cb8e432d2ade0a5b4ec187d420abe9ae113 Mon Sep 17 00:00:00 2001
From: Trivikram Kamat <16024985+trivikr@users.noreply.github.com>
Date: Mon, 9 Sep 2024 06:46:12 -0700
Subject: [PATCH 55/74] ci: bump actions/checkout to v4

---
 .github/workflows/ci.yml  | 14 +++++++-------
 .github/workflows/doc.yml |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8b5213376d88d..e4a3da359266f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -17,7 +17,7 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Run rustfmt
         run: cargo fmt --all -- --check
 
@@ -46,7 +46,7 @@ jobs:
           - wasm32-unknown-unknown
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Setup Rust
         run: rustup target add ${{ matrix.target }}
       - name: Run Clippy
@@ -84,7 +84,7 @@ jobs:
           # avx512vl, but occasionally doesn't.  Maybe one day we can enable it.
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Setup Rust
         run: rustup target add ${{ matrix.target }}
 
@@ -139,7 +139,7 @@ jobs:
           - aarch64-apple-darwin
           - x86_64-apple-darwin
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Setup Rust
         run: rustup target add ${{ matrix.target }}
 
@@ -167,7 +167,7 @@ jobs:
           - { name: default, RUSTFLAGS: "" }
           - { name: simd128, RUSTFLAGS: "-C target-feature=+simd128" }
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Install wasm-pack
         run: curl https://rustwasm.github.io/wasm-pack/installer/init.sh -sSf | sh
       - name: Test (debug)
@@ -211,7 +211,7 @@ jobs:
           # - { target: riscv64gc-unknown-linux-gnu, target_feature: "+v,+zvl128b" }
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Setup Rust
         run: rustup target add ${{ matrix.target }}
 
@@ -253,6 +253,6 @@ jobs:
     env:
       PROPTEST_CASES: 16
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Test (Miri)
         run: cargo miri test
diff --git a/.github/workflows/doc.yml b/.github/workflows/doc.yml
index 9d1fa66ccb595..22c2cb3f67f1b 100644
--- a/.github/workflows/doc.yml
+++ b/.github/workflows/doc.yml
@@ -12,7 +12,7 @@ jobs:
 
     steps:
       - name: Checkout Repository
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4
 
       - name: Setup Rust
         run: |

From 27e2832e87aff7b19b7695d21d347f476fc83da3 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Wed, 11 Sep 2024 17:12:37 +0000
Subject: [PATCH 56/74] Add count_ones() and count_zeros()

Implement on integer types using the simd_ctpop intrinsic.
---
 crates/core_simd/src/simd/num/int.rs  | 16 ++++++++++++++++
 crates/core_simd/src/simd/num/uint.rs | 17 +++++++++++++++++
 crates/core_simd/tests/ops_macros.rs  | 16 ++++++++++++++++
 3 files changed, 49 insertions(+)

diff --git a/crates/core_simd/src/simd/num/int.rs b/crates/core_simd/src/simd/num/int.rs
index 5561fe20e61f0..3a51235ff954e 100644
--- a/crates/core_simd/src/simd/num/int.rs
+++ b/crates/core_simd/src/simd/num/int.rs
@@ -219,6 +219,12 @@ pub trait SimdInt: Copy + Sealed {
     /// The least significant bit becomes the most significant bit, second least-significant bit becomes second most-significant bit, etc.
     fn reverse_bits(self) -> Self;
 
+    /// Returns the number of ones in the binary representation of each element.
+    fn count_ones(self) -> Self::Unsigned;
+
+    /// Returns the number of zeros in the binary representation of each element.
+    fn count_zeros(self) -> Self::Unsigned;
+
     /// Returns the number of leading zeros in the binary representation of each element.
     fn leading_zeros(self) -> Self::Unsigned;
 
@@ -367,6 +373,16 @@ macro_rules! impl_trait {
                 unsafe { core::intrinsics::simd::simd_bitreverse(self) }
             }
 
+            #[inline]
+            fn count_ones(self) -> Self::Unsigned {
+                self.cast::<$unsigned>().count_ones()
+            }
+
+            #[inline]
+            fn count_zeros(self) -> Self::Unsigned {
+                self.cast::<$unsigned>().count_zeros()
+            }
+
             #[inline]
             fn leading_zeros(self) -> Self::Unsigned {
                 self.cast::<$unsigned>().leading_zeros()
diff --git a/crates/core_simd/src/simd/num/uint.rs b/crates/core_simd/src/simd/num/uint.rs
index 7cc1b5bf8b613..1ab2d8c7b7316 100644
--- a/crates/core_simd/src/simd/num/uint.rs
+++ b/crates/core_simd/src/simd/num/uint.rs
@@ -101,6 +101,12 @@ pub trait SimdUint: Copy + Sealed {
     /// The least significant bit becomes the most significant bit, second least-significant bit becomes second most-significant bit, etc.
     fn reverse_bits(self) -> Self;
 
+    /// Returns the number of ones in the binary representation of each element.
+    fn count_ones(self) -> Self;
+
+    /// Returns the number of zeros in the binary representation of each element.
+    fn count_zeros(self) -> Self;
+
     /// Returns the number of leading zeros in the binary representation of each element.
     fn leading_zeros(self) -> Self;
 
@@ -215,6 +221,17 @@ macro_rules! impl_trait {
                 unsafe { core::intrinsics::simd::simd_bitreverse(self) }
             }
 
+            #[inline]
+            fn count_ones(self) -> Self {
+                // Safety: `self` is an integer vector
+                unsafe { core::intrinsics::simd::simd_ctpop(self) }
+            }
+
+            #[inline]
+            fn count_zeros(self) -> Self {
+                (!self).count_ones()
+            }
+
             #[inline]
             fn leading_zeros(self) -> Self {
                 // Safety: `self` is an integer vector
diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs
index 6503cc00053c5..6e64bfcb424e5 100644
--- a/crates/core_simd/tests/ops_macros.rs
+++ b/crates/core_simd/tests/ops_macros.rs
@@ -216,6 +216,22 @@ macro_rules! impl_common_integer_tests {
                 )
             }
 
+            fn count_ones<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &$vector::<LANES>::count_ones,
+                    &|x| x.count_ones() as _,
+                    &|_| true,
+                )
+            }
+
+            fn count_zeros<const LANES: usize>() {
+                test_helpers::test_unary_elementwise(
+                    &$vector::<LANES>::count_zeros,
+                    &|x| x.count_zeros() as _,
+                    &|_| true,
+                )
+            }
+
             fn leading_zeros<const LANES: usize>() {
                 test_helpers::test_unary_elementwise(
                     &$vector::<LANES>::leading_zeros,

From 00c3b6d68e1a50bb9323b13ae5cf1ab3820da56d Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Wed, 11 Sep 2024 19:29:51 -0700
Subject: [PATCH 57/74] Use -0.0 as the neutral additive float

-0.0 +  0.0 is  0.0
-0.0 + -0.0 is -0.0

Thus, the float additive-zero is actually -0.0, not its positive cousin.
This change aligns with a recent change to the impl of Sum for floats,
in rust-lang/rust@490818851860fb257e23fe7aa0ee32eaffc4ba40 and
accordingly we also have to use the latest toolchain for our tests now.
---
 crates/core_simd/src/simd/num/float.rs | 2 +-
 rust-toolchain.toml                    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/core_simd/src/simd/num/float.rs b/crates/core_simd/src/simd/num/float.rs
index 48bfca32d53e0..79954b937b397 100644
--- a/crates/core_simd/src/simd/num/float.rs
+++ b/crates/core_simd/src/simd/num/float.rs
@@ -419,7 +419,7 @@ macro_rules! impl_trait {
                     self.as_array().iter().sum()
                 } else {
                     // Safety: `self` is a float vector
-                    unsafe { core::intrinsics::simd::simd_reduce_add_ordered(self, 0.) }
+                    unsafe { core::intrinsics::simd::simd_reduce_add_ordered(self, -0.) }
                 }
             }
 
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
index 811fdb49cdba8..d6239a040a5b4 100644
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,3 +1,3 @@
 [toolchain]
-channel = "nightly-2024-06-13"
+channel = "nightly-2024-09-11"
 components = ["rustfmt", "clippy", "miri", "rust-src"]

From c080ba539f60f8a3888ddebc52c3db157dd1b2d1 Mon Sep 17 00:00:00 2001
From: Jubilee Young <workingjubilee@gmail.com>
Date: Wed, 11 Sep 2024 20:21:09 -0700
Subject: [PATCH 58/74] Exempt Arm v7 Neon from subnormal-related tests

---
 crates/core_simd/tests/ops_macros.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/crates/core_simd/tests/ops_macros.rs b/crates/core_simd/tests/ops_macros.rs
index 6e64bfcb424e5..6de78f51e59df 100644
--- a/crates/core_simd/tests/ops_macros.rs
+++ b/crates/core_simd/tests/ops_macros.rs
@@ -527,6 +527,9 @@ macro_rules! impl_float_tests {
                 }
 
                 fn is_normal<const LANES: usize>() {
+                    // Arm v7 Neon violates float opsem re: subnormals, see
+                    // https://github.com/rust-lang/portable-simd/issues/439
+                    #[cfg(not(target_arch = "arm"))]
                     test_helpers::test_unary_mask_elementwise(
                         &Vector::<LANES>::is_normal,
                         &Scalar::is_normal,
@@ -535,6 +538,9 @@ macro_rules! impl_float_tests {
                 }
 
                 fn is_subnormal<const LANES: usize>() {
+                    // Arm v7 Neon violates float opsem re: subnormals, see
+                    // https://github.com/rust-lang/portable-simd/issues/439
+                    #[cfg(not(target_arch = "arm"))]
                     test_helpers::test_unary_mask_elementwise(
                         &Vector::<LANES>::is_subnormal,
                         &Scalar::is_subnormal,

From 4111fb2cbd63e98a85436aa163d0f9a2dac5ee3e Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Thu, 12 Sep 2024 11:33:13 +0200
Subject: [PATCH 59/74] simd_shuffle: require index argument to be a vector

---
 crates/core_simd/src/swizzle.rs | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index 2f4f777b20e29..d62642fb9061b 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -85,7 +85,7 @@ pub trait Swizzle<const N: usize> {
         LaneCount<N>: SupportedLaneCount,
         LaneCount<M>: SupportedLaneCount,
     {
-        // Safety: `vector` is a vector, and the index is a const array of u32.
+        // Safety: `vector` is a vector, and the index is a const vector of u32.
         unsafe {
             core::intrinsics::simd::simd_shuffle(
                 vector,
@@ -103,7 +103,11 @@ pub trait Swizzle<const N: usize> {
                         output[i] = index as u32;
                         i += 1;
                     }
-                    output
+
+                    // The index list needs to be returned as a vector.
+                    #[repr(simd)]
+                    struct SimdShuffleIdx<const LEN: usize>([u32; LEN]);
+                    SimdShuffleIdx(output)
                 },
             )
         }
@@ -121,7 +125,7 @@ pub trait Swizzle<const N: usize> {
         LaneCount<N>: SupportedLaneCount,
         LaneCount<M>: SupportedLaneCount,
     {
-        // Safety: `first` and `second` are vectors, and the index is a const array of u32.
+        // Safety: `first` and `second` are vectors, and the index is a const vector of u32.
         unsafe {
             core::intrinsics::simd::simd_shuffle(
                 first,
@@ -139,7 +143,11 @@ pub trait Swizzle<const N: usize> {
                         output[i] = index as u32;
                         i += 1;
                     }
-                    output
+
+                    // The index list needs to be returned as a vector.
+                    #[repr(simd)]
+                    struct SimdShuffleIdx<const LEN: usize>([u32; LEN]);
+                    SimdShuffleIdx(output)
                 },
             )
         }

From 24587921617a71fbf09f118f85af68a181948395 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Mon, 23 Sep 2024 21:29:57 +0200
Subject: [PATCH 60/74] stabilize const_intrinsic_copy

---
 crates/core_simd/src/lib.rs | 2 --
 1 file changed, 2 deletions(-)

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 331b66262490c..cc6246b4a0d41 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -1,8 +1,6 @@
 #![no_std]
 #![feature(
-    const_intrinsic_copy,
     const_refs_to_cell,
-    const_maybe_uninit_as_mut_ptr,
     const_mut_refs,
     convert_float_to_int,
     core_intrinsics,

From c7d9ad8c2cae5fcc572fe2f5ed5c88fac25c4a97 Mon Sep 17 00:00:00 2001
From: Samuel Shepard <vfn4@cdc.gov>
Date: Fri, 27 Sep 2024 15:07:28 -0400
Subject: [PATCH 61/74] Add shift_elements_{left,right} for Simd and Masks

---
 crates/core_simd/src/swizzle.rs | 74 +++++++++++++++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index e0edb2cf10a6d..9fa6a7da8d757 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -251,6 +251,56 @@ where
         Rotate::<OFFSET>::swizzle(self)
     }
 
+    /// Shifts the vector elements to the left by `OFFSET`, padding by the
+    /// default value (e.g., zero) to the right.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    fn shift_elements_left<const OFFSET: usize>(self) -> Self
+    where
+        T: Default,
+    {
+        struct Shift<const OFFSET: usize>;
+
+        impl<const OFFSET: usize, const N: usize> Swizzle<N> for Shift<OFFSET> {
+            const INDEX: [usize; N] = const {
+                let mut index = [N; N];
+                let mut i = 0;
+                while i + OFFSET < N {
+                    index[i] = i + OFFSET;
+                    i += 1;
+                }
+                index
+            };
+        }
+
+        Shift::<OFFSET>::concat_swizzle(self, Self::default())
+    }
+
+    /// Shifts the vector elements to the right by `OFFSET`, padding by the
+    /// default value (e.g., zero) from the left.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    fn shift_elements_right<const OFFSET: usize>(self) -> Self
+    where
+        T: Default,
+    {
+        struct Shift<const OFFSET: usize>;
+
+        impl<const OFFSET: usize, const N: usize> Swizzle<N> for Shift<OFFSET> {
+            const INDEX: [usize; N] = const {
+                let mut index = [N; N];
+                let mut i = OFFSET;
+                while i < N {
+                    index[i] = i - OFFSET;
+                    i += 1;
+                }
+                index
+            };
+        }
+
+        Shift::<OFFSET>::concat_swizzle(self, Self::default())
+    }
+
     /// Interleave two vectors.
     ///
     /// The resulting vectors contain elements taken alternatively from `self` and `other`, first
@@ -451,6 +501,30 @@ where
         unsafe { Self::from_int_unchecked(self.to_int().rotate_elements_right::<OFFSET>()) }
     }
 
+    /// Shifts the mask elements to the left by `OFFSET`, padding by the
+    /// default value (e.g., zero) to the right.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn shift_elements_left<const OFFSET: usize>(self) -> Self
+    where
+        T: Default,
+    {
+        // Safety: swizzles are safe for masks
+        unsafe { Self::from_int_unchecked(self.to_int().shift_elements_left::<OFFSET>()) }
+    }
+
+    /// Shifts the mask elements to the right by `OFFSET`, padding by the
+    /// default value (e.g., `false`) from the left.
+    #[inline]
+    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    pub fn shift_elements_right<const OFFSET: usize>(self) -> Self
+    where
+        T: Default,
+    {
+        // Safety: swizzles are safe for masks
+        unsafe { Self::from_int_unchecked(self.to_int().shift_elements_right::<OFFSET>()) }
+    }
+
     /// Interleave two masks.
     ///
     /// The resulting masks contain elements taken alternatively from `self` and `other`, first

From 55b4b74a120dc83317f143b8a7ce292f6afd34ea Mon Sep 17 00:00:00 2001
From: Samuel Shepard <vfn4@cdc.gov>
Date: Fri, 27 Sep 2024 15:12:58 -0400
Subject: [PATCH 62/74] Add tests, make public

---
 crates/core_simd/src/swizzle.rs   |  4 ++--
 crates/core_simd/tests/swizzle.rs | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index 9fa6a7da8d757..cf1e08aa668a2 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -255,7 +255,7 @@ where
     /// default value (e.g., zero) to the right.
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
-    fn shift_elements_left<const OFFSET: usize>(self) -> Self
+    pub fn shift_elements_left<const OFFSET: usize>(self) -> Self
     where
         T: Default,
     {
@@ -280,7 +280,7 @@ where
     /// default value (e.g., zero) from the left.
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
-    fn shift_elements_right<const OFFSET: usize>(self) -> Self
+    pub fn shift_elements_right<const OFFSET: usize>(self) -> Self
     where
         T: Default,
     {
diff --git a/crates/core_simd/tests/swizzle.rs b/crates/core_simd/tests/swizzle.rs
index 522d71439b77d..98045fc5c5443 100644
--- a/crates/core_simd/tests/swizzle.rs
+++ b/crates/core_simd/tests/swizzle.rs
@@ -48,6 +48,24 @@ fn rotate() {
     assert_eq!(a.rotate_elements_right::<5>().to_array(), [4, 1, 2, 3]);
 }
 
+#[test]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn shift() {
+    let a = Simd::from_array([1, 2, 3, 4]);
+    assert_eq!(a.shift_elements_left::<0>().to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.shift_elements_left::<1>().to_array(), [2, 3, 4, 0]);
+    assert_eq!(a.shift_elements_left::<2>().to_array(), [3, 4, 0, 0]);
+    assert_eq!(a.shift_elements_left::<3>().to_array(), [4, 0, 0, 0]);
+    assert_eq!(a.shift_elements_left::<4>().to_array(), [0, 0, 0, 0]);
+    assert_eq!(a.shift_elements_left::<5>().to_array(), [0, 0, 0, 0]);
+    assert_eq!(a.shift_elements_right::<0>().to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.shift_elements_right::<1>().to_array(), [0, 1, 2, 3]);
+    assert_eq!(a.shift_elements_right::<2>().to_array(), [0, 0, 1, 2]);
+    assert_eq!(a.shift_elements_right::<3>().to_array(), [0, 0, 0, 1]);
+    assert_eq!(a.shift_elements_right::<4>().to_array(), [0, 0, 0, 0]);
+    assert_eq!(a.shift_elements_right::<5>().to_array(), [0, 0, 0, 0]);
+}
+
 #[test]
 #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
 fn interleave() {

From f5fea5702863eb9ff30b42ee4e8942d9f70eea54 Mon Sep 17 00:00:00 2001
From: Samuel Shepard <vfn4@cdc.gov>
Date: Fri, 27 Sep 2024 19:18:30 -0400
Subject: [PATCH 63/74] Change API to accept a `padding` argument

---
 crates/core_simd/src/swizzle.rs   | 44 +++++++++++--------------------
 crates/core_simd/tests/swizzle.rs | 24 ++++++++---------
 2 files changed, 28 insertions(+), 40 deletions(-)

diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index cf1e08aa668a2..6353196e4cf47 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -251,14 +251,11 @@ where
         Rotate::<OFFSET>::swizzle(self)
     }
 
-    /// Shifts the vector elements to the left by `OFFSET`, padding by the
-    /// default value (e.g., zero) to the right.
+    /// Shifts the vector elements to the left by `OFFSET`, filling in with
+    /// `padding` from the right.
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
-    pub fn shift_elements_left<const OFFSET: usize>(self) -> Self
-    where
-        T: Default,
-    {
+    pub fn shift_elements_left<const OFFSET: usize>(self, padding: T) -> Self {
         struct Shift<const OFFSET: usize>;
 
         impl<const OFFSET: usize, const N: usize> Swizzle<N> for Shift<OFFSET> {
@@ -273,17 +270,14 @@ where
             };
         }
 
-        Shift::<OFFSET>::concat_swizzle(self, Self::default())
+        Shift::<OFFSET>::concat_swizzle(self, Simd::splat(padding))
     }
 
-    /// Shifts the vector elements to the right by `OFFSET`, padding by the
-    /// default value (e.g., zero) from the left.
+    /// Shifts the vector elements to the right by `OFFSET`, filling in with
+    /// `padding` from the left.
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
-    pub fn shift_elements_right<const OFFSET: usize>(self) -> Self
-    where
-        T: Default,
-    {
+    pub fn shift_elements_right<const OFFSET: usize>(self, padding: T) -> Self {
         struct Shift<const OFFSET: usize>;
 
         impl<const OFFSET: usize, const N: usize> Swizzle<N> for Shift<OFFSET> {
@@ -298,7 +292,7 @@ where
             };
         }
 
-        Shift::<OFFSET>::concat_swizzle(self, Self::default())
+        Shift::<OFFSET>::concat_swizzle(self, Simd::splat(padding))
     }
 
     /// Interleave two vectors.
@@ -501,28 +495,22 @@ where
         unsafe { Self::from_int_unchecked(self.to_int().rotate_elements_right::<OFFSET>()) }
     }
 
-    /// Shifts the mask elements to the left by `OFFSET`, padding by the
-    /// default value (e.g., zero) to the right.
+    /// Shifts the mask elements to the left by `OFFSET`, filling in with
+    /// `padding` from the right.
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
-    pub fn shift_elements_left<const OFFSET: usize>(self) -> Self
-    where
-        T: Default,
-    {
+    pub fn shift_elements_left<const OFFSET: usize>(self, padding: T) -> Self {
         // Safety: swizzles are safe for masks
-        unsafe { Self::from_int_unchecked(self.to_int().shift_elements_left::<OFFSET>()) }
+        unsafe { Self::from_int_unchecked(self.to_int().shift_elements_left::<OFFSET>(padding)) }
     }
 
-    /// Shifts the mask elements to the right by `OFFSET`, padding by the
-    /// default value (e.g., `false`) from the left.
+    /// Shifts the mask elements to the right by `OFFSET`, filling in with
+    /// `padding` from the left.
     #[inline]
     #[must_use = "method returns a new vector and does not mutate the original inputs"]
-    pub fn shift_elements_right<const OFFSET: usize>(self) -> Self
-    where
-        T: Default,
-    {
+    pub fn shift_elements_right<const OFFSET: usize>(self, padding: T) -> Self {
         // Safety: swizzles are safe for masks
-        unsafe { Self::from_int_unchecked(self.to_int().shift_elements_right::<OFFSET>()) }
+        unsafe { Self::from_int_unchecked(self.to_int().shift_elements_right::<OFFSET>(padding)) }
     }
 
     /// Interleave two masks.
diff --git a/crates/core_simd/tests/swizzle.rs b/crates/core_simd/tests/swizzle.rs
index 98045fc5c5443..7001e5f6bf87b 100644
--- a/crates/core_simd/tests/swizzle.rs
+++ b/crates/core_simd/tests/swizzle.rs
@@ -52,18 +52,18 @@ fn rotate() {
 #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
 fn shift() {
     let a = Simd::from_array([1, 2, 3, 4]);
-    assert_eq!(a.shift_elements_left::<0>().to_array(), [1, 2, 3, 4]);
-    assert_eq!(a.shift_elements_left::<1>().to_array(), [2, 3, 4, 0]);
-    assert_eq!(a.shift_elements_left::<2>().to_array(), [3, 4, 0, 0]);
-    assert_eq!(a.shift_elements_left::<3>().to_array(), [4, 0, 0, 0]);
-    assert_eq!(a.shift_elements_left::<4>().to_array(), [0, 0, 0, 0]);
-    assert_eq!(a.shift_elements_left::<5>().to_array(), [0, 0, 0, 0]);
-    assert_eq!(a.shift_elements_right::<0>().to_array(), [1, 2, 3, 4]);
-    assert_eq!(a.shift_elements_right::<1>().to_array(), [0, 1, 2, 3]);
-    assert_eq!(a.shift_elements_right::<2>().to_array(), [0, 0, 1, 2]);
-    assert_eq!(a.shift_elements_right::<3>().to_array(), [0, 0, 0, 1]);
-    assert_eq!(a.shift_elements_right::<4>().to_array(), [0, 0, 0, 0]);
-    assert_eq!(a.shift_elements_right::<5>().to_array(), [0, 0, 0, 0]);
+    assert_eq!(a.shift_elements_left::<0>(0).to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.shift_elements_left::<1>(0).to_array(), [2, 3, 4, 0]);
+    assert_eq!(a.shift_elements_left::<2>(9).to_array(), [3, 4, 9, 9]);
+    assert_eq!(a.shift_elements_left::<3>(8).to_array(), [4, 8, 8, 8]);
+    assert_eq!(a.shift_elements_left::<4>(7).to_array(), [7, 7, 7, 7]);
+    assert_eq!(a.shift_elements_left::<5>(6).to_array(), [6, 6, 6, 6]);
+    assert_eq!(a.shift_elements_right::<0>(0).to_array(), [1, 2, 3, 4]);
+    assert_eq!(a.shift_elements_right::<1>(0).to_array(), [0, 1, 2, 3]);
+    assert_eq!(a.shift_elements_right::<2>(-1).to_array(), [-1, -1, 1, 2]);
+    assert_eq!(a.shift_elements_right::<3>(-2).to_array(), [-2, -2, -2, 1]);
+    assert_eq!(a.shift_elements_right::<4>(-3).to_array(), [-3, -3, -3, -3]);
+    assert_eq!(a.shift_elements_right::<5>(-4).to_array(), [-4, -4, -4, -4]);
 }
 
 #[test]

From 8cff838daa3274c0c4f9bbef67c46587033e267d Mon Sep 17 00:00:00 2001
From: Sam Shepard <sammysheep@users.noreply.github.com>
Date: Fri, 27 Sep 2024 21:04:49 -0400
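Subject: [PATCH 64/74] Update crates/core_simd/src/swizzle.rs

Correct the `#[must_use]` message on the mask `shift_elements_left`
method: it returns a new mask, not a new vector.

Co-authored-by: Caleb Zulawski <caleb.zulawski@gmail.com>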
---
 crates/core_simd/src/swizzle.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index 6353196e4cf47..a7833ea92c0f2 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -498,7 +498,7 @@ where
     /// Shifts the mask elements to the left by `OFFSET`, filling in with
     /// `padding` from the right.
     #[inline]
-    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    #[must_use = "method returns a new mask and does not mutate the original inputs"]
     pub fn shift_elements_left<const OFFSET: usize>(self, padding: T) -> Self {
         // Safety: swizzles are safe for masks
         unsafe { Self::from_int_unchecked(self.to_int().shift_elements_left::<OFFSET>(padding)) }

From c9c0bf97f031cc7ab15209cd12b18ab72d019941 Mon Sep 17 00:00:00 2001
From: Sam Shepard <sammysheep@users.noreply.github.com>
Date: Fri, 27 Sep 2024 21:05:07 -0400
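Subject: [PATCH 65/74] Update crates/core_simd/src/swizzle.rs

Correct the `#[must_use]` message on the mask `shift_elements_right`
method: it returns a new mask, not a new vector.

Co-authored-by: Caleb Zulawski <caleb.zulawski@gmail.com>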
---
 crates/core_simd/src/swizzle.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index a7833ea92c0f2..3b552016cb59d 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -507,7 +507,7 @@ where
     /// Shifts the mask elements to the right by `OFFSET`, filling in with
     /// `padding` from the left.
     #[inline]
-    #[must_use = "method returns a new vector and does not mutate the original inputs"]
+    #[must_use = "method returns a new mask and does not mutate the original inputs"]
     pub fn shift_elements_right<const OFFSET: usize>(self, padding: T) -> Self {
         // Safety: swizzles are safe for masks
         unsafe { Self::from_int_unchecked(self.to_int().shift_elements_right::<OFFSET>(padding)) }

From 9392fb1c2b73b453a53392ebfd52462100e0c430 Mon Sep 17 00:00:00 2001
From: Samuel Shepard <vfn4@cdc.gov>
Date: Fri, 27 Sep 2024 21:27:47 -0400
Subject: [PATCH 66/74] Change mask function to accept bool

---
 crates/core_simd/src/swizzle.rs | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/crates/core_simd/src/swizzle.rs b/crates/core_simd/src/swizzle.rs
index 3b552016cb59d..dbd84543064e7 100644
--- a/crates/core_simd/src/swizzle.rs
+++ b/crates/core_simd/src/swizzle.rs
@@ -499,18 +499,30 @@ where
     /// `padding` from the right.
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original inputs"]
-    pub fn shift_elements_left<const OFFSET: usize>(self, padding: T) -> Self {
+    pub fn shift_elements_left<const OFFSET: usize>(self, padding: bool) -> Self {
         // Safety: swizzles are safe for masks
-        unsafe { Self::from_int_unchecked(self.to_int().shift_elements_left::<OFFSET>(padding)) }
+        unsafe {
+            Self::from_int_unchecked(self.to_int().shift_elements_left::<OFFSET>(if padding {
+                T::TRUE
+            } else {
+                T::FALSE
+            }))
+        }
     }
 
     /// Shifts the mask elements to the right by `OFFSET`, filling in with
     /// `padding` from the left.
     #[inline]
     #[must_use = "method returns a new mask and does not mutate the original inputs"]
-    pub fn shift_elements_right<const OFFSET: usize>(self, padding: T) -> Self {
+    pub fn shift_elements_right<const OFFSET: usize>(self, padding: bool) -> Self {
         // Safety: swizzles are safe for masks
-        unsafe { Self::from_int_unchecked(self.to_int().shift_elements_right::<OFFSET>(padding)) }
+        unsafe {
+            Self::from_int_unchecked(self.to_int().shift_elements_right::<OFFSET>(if padding {
+                T::TRUE
+            } else {
+                T::FALSE
+            }))
+        }
     }
 
     /// Interleave two masks.

From 6c1656df84004f77d40dfab09f9a3f9f6c20efe7 Mon Sep 17 00:00:00 2001
From: Ralf Jung <post@ralfj.de>
Date: Sat, 14 Sep 2024 13:44:02 +0200
Subject: [PATCH 67/74] move strict provenance lints to new feature gate,
 remove old feature gates

---
 crates/core_simd/src/lib.rs        | 1 -
 crates/core_simd/tests/pointers.rs | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index cc6246b4a0d41..992a7705e3c52 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -9,7 +9,6 @@
     repr_simd,
     simd_ffi,
     staged_api,
-    strict_provenance,
     prelude_import,
     ptr_metadata
 )]
diff --git a/crates/core_simd/tests/pointers.rs b/crates/core_simd/tests/pointers.rs
index 90bfc5d5fd6a5..d7db4e82b3ca2 100644
--- a/crates/core_simd/tests/pointers.rs
+++ b/crates/core_simd/tests/pointers.rs
@@ -1,4 +1,4 @@
-#![feature(portable_simd, strict_provenance, exposed_provenance)]
+#![feature(portable_simd)]
 
 use core_simd::simd::{
     ptr::{SimdConstPtr, SimdMutPtr},

From 7e162d19dd2e245dbba0e37fd12fe2cdaafdfed8 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Wed, 23 Oct 2024 00:42:29 +0200
Subject: [PATCH 68/74] rust-lang/portable-simd#443: Add armv7 neon
 implementation for `Simd<u8, 16>::swizzle_dyn`

Use arm neon intrinsics to swizzle two u8x8 blocks with a u8x8x2 lookup table.
---
 crates/core_simd/src/swizzle_dyn.rs | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/crates/core_simd/src/swizzle_dyn.rs b/crates/core_simd/src/swizzle_dyn.rs
index eaf297ba3e35d..0619404e5f705 100644
--- a/crates/core_simd/src/swizzle_dyn.rs
+++ b/crates/core_simd/src/swizzle_dyn.rs
@@ -57,6 +57,13 @@ where
                     target_endian = "little"
                 ))]
                 16 => transize(vqtbl1q_u8, self, idxs),
+                #[cfg(all(
+                    target_arch = "arm",
+                    target_feature = "v7",
+                    target_feature = "neon",
+                    target_endian = "little"
+                ))]
+                16 => transize(armv7_neon_swizzle_u8x16, self, idxs),
                 #[cfg(all(target_feature = "avx2", not(target_feature = "avx512vbmi")))]
                 32 => transize(avx2_pshufb, self, idxs),
                 #[cfg(all(target_feature = "avx512vl", target_feature = "avx512vbmi"))]
@@ -98,6 +105,28 @@ where
     }
 }
 
+/// Swizzles a `u8x16` on armv7 neon: `bytes` is split into a `u8x8x2` lookup
+/// table, and each `u8x8` half of `idxs` is looked up in it with `vtbl2_u8`.
+///
+/// # Safety
+/// The caller must ensure that armv7 neon is available on the target.
+#[cfg(all(
+    target_arch = "arm",
+    target_feature = "v7",
+    target_feature = "neon",
+    target_endian = "little"
+))]
+unsafe fn armv7_neon_swizzle_u8x16(bytes: Simd<u8, 16>, idxs: Simd<u8, 16>) -> Simd<u8, 16> {
+    use core::arch::arm::{uint8x8x2_t, vcombine_u8, vget_high_u8, vget_low_u8, vtbl2_u8};
+    // SAFETY: the caller guarantees that armv7 neon is available (see `# Safety`)
+    unsafe {
+        let bytes = uint8x8x2_t(vget_low_u8(bytes.into()), vget_high_u8(bytes.into()));
+        let lo = vtbl2_u8(bytes, vget_low_u8(idxs.into()));
+        let hi = vtbl2_u8(bytes, vget_high_u8(idxs.into()));
+        vcombine_u8(lo, hi).into()
+    }
+}
+
 /// "vpshufb like it was meant to be" on AVX2
 ///
 /// # Safety

From 35ac70a47830ba1c0590bc55523b70951ba024d3 Mon Sep 17 00:00:00 2001
From: WANG Rui <wangrui@loongson.cn>
Date: Sat, 2 Nov 2024 10:43:43 +0800
Subject: [PATCH 69/74] ci: add support for loongarch64-unknown-linux-gnu

---
 .github/workflows/ci.yml | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e4a3da359266f..3984d8f0d8d99 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -35,6 +35,7 @@ jobs:
           - aarch64-unknown-linux-gnu
           - arm64ec-pc-windows-msvc
           - armv7-unknown-linux-gnueabihf
+          - loongarch64-unknown-linux-gnu
           # non-nightly since https://github.com/rust-lang/rust/pull/113274
           # - mips-unknown-linux-gnu
           # - mips64-unknown-linux-gnuabi64
@@ -195,6 +196,7 @@ jobs:
           - powerpc-unknown-linux-gnu
           - powerpc64le-unknown-linux-gnu       # includes altivec by default
           - riscv64gc-unknown-linux-gnu
+          - loongarch64-unknown-linux-gnu
           # MIPS uses a nonstandard binary representation for NaNs which makes it worth testing
           # non-nightly since https://github.com/rust-lang/rust/pull/113274
           # - mips-unknown-linux-gnu
@@ -216,15 +218,9 @@ jobs:
         run: rustup target add ${{ matrix.target }}
 
       - name: Install Cross
-        # Equivalent to `cargo install cross`, but downloading a prebuilt
-        # binary. Ideally we wouldn't hardcode a version, but the version number
-        # being part of the tarball means we can't just use the download/latest
-        # URL :(
+        # Install cross from a pinned git revision to get support for newer targets.
         run: |
-          CROSS_URL=https://github.com/cross-rs/cross/releases/download/v0.2.5/cross-x86_64-unknown-linux-gnu.tar.gz
-          mkdir -p "$HOME/.bin"
-          curl -sfSL --retry-delay 10 --retry 5 "${CROSS_URL}" | tar zxf - -C "$HOME/.bin"
-          echo "$HOME/.bin" >> $GITHUB_PATH
+          cargo install cross --git https://github.com/cross-rs/cross --rev 4090beca3cfffa44371a5bba524de3a578aa46c3
 
       - name: Configure Emulated CPUs
         run: |

From f6a227690ee55d77f253c87d4f312879ae578596 Mon Sep 17 00:00:00 2001
From: AquaEBM <AquaEBM@gmail.com>
Date: Mon, 25 Nov 2024 12:44:26 +0100
Subject: [PATCH 70/74] add rustc_const_unstable attribute to Simd::splat

---
 crates/core_simd/src/vector.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crates/core_simd/src/vector.rs b/crates/core_simd/src/vector.rs
index fac11d66e1b04..6518927db61d2 100644
--- a/crates/core_simd/src/vector.rs
+++ b/crates/core_simd/src/vector.rs
@@ -144,6 +144,7 @@ where
     /// assert_eq!(v.as_array(), &[8, 8, 8, 8]);
     /// ```
     #[inline]
+    #[rustc_const_unstable(feature = "portable_simd", issue = "86656")]
     pub const fn splat(value: T) -> Self {
         const fn splat_const<T, const N: usize>(value: T) -> Simd<T, N>
         where

From 8423171f11c1dc494779bea6f93d63acb876d402 Mon Sep 17 00:00:00 2001
From: daxpedda <daxpedda@gmail.com>
Date: Sun, 15 Dec 2024 09:59:15 +0100
Subject: [PATCH 71/74] Bump `stdarch`

---
 crates/core_simd/src/vendor/arm.rs | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/crates/core_simd/src/vendor/arm.rs b/crates/core_simd/src/vendor/arm.rs
index f8878d11f094d..3dc54481b6fd4 100644
--- a/crates/core_simd/src/vendor/arm.rs
+++ b/crates/core_simd/src/vendor/arm.rs
@@ -48,17 +48,6 @@ mod neon {
     from_transmute! { unsafe u64x2 => poly64x2_t }
 }
 
-#[cfg(any(
-    all(target_feature = "v5te", not(target_feature = "mclass")),
-    all(target_feature = "mclass", target_feature = "dsp"),
-))]
-mod dsp {
-    use super::*;
-
-    from_transmute! { unsafe Simd<u16, 2> => uint16x2_t }
-    from_transmute! { unsafe Simd<i16, 2> => int16x2_t }
-}
-
 #[cfg(any(
     all(target_feature = "v6", not(target_feature = "mclass")),
     all(target_feature = "mclass", target_feature = "dsp"),
@@ -68,6 +57,8 @@ mod simd32 {
 
     from_transmute! { unsafe Simd<u8, 4> => uint8x4_t }
     from_transmute! { unsafe Simd<i8, 4> => int8x4_t }
+    from_transmute! { unsafe Simd<u16, 2> => uint16x2_t }
+    from_transmute! { unsafe Simd<i16, 2> => int16x2_t }
 }
 
 #[cfg(all(

From 638667a135fd84f89a11e34db5ad13ed20593275 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Thu, 16 Jan 2025 01:41:34 -0500
Subject: [PATCH 72/74] Remove stable features

---
 crates/core_simd/src/lib.rs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/crates/core_simd/src/lib.rs b/crates/core_simd/src/lib.rs
index 3326341921bd9..7f57847c9c234 100644
--- a/crates/core_simd/src/lib.rs
+++ b/crates/core_simd/src/lib.rs
@@ -1,9 +1,6 @@
 #![no_std]
 #![feature(
     const_eval_select,
-    const_intrinsic_copy,
-    const_refs_to_cell,
-    const_mut_refs,
     convert_float_to_int,
     core_intrinsics,
     decl_macro,

From 15c19722de927ef1e44cb9b07cc5a44efc4e0654 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sat, 18 Jan 2025 15:15:27 -0500
Subject: [PATCH 73/74] Add script for syncing subtree

---
 .gitignore      |  1 +
 subtree-sync.sh | 52 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+)
 create mode 100755 subtree-sync.sh

diff --git a/.gitignore b/.gitignore
index ea8c4bf7f35f6..9673e52dcadba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 /target
+git-subtree.sh
diff --git a/subtree-sync.sh b/subtree-sync.sh
new file mode 100755
index 0000000000000..18360077623b1
--- /dev/null
+++ b/subtree-sync.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# Usage: subtree-sync.sh <push|pull> <path-to-portable-simd-checkout>
+set -eou pipefail
+
+git fetch origin
+pushd $2
+git fetch origin
+popd
+# Must be invoked from the git root of the repository containing library/portable-simd.
+if [ "$(git rev-parse --show-prefix)" != "" ]; then
+    echo "Run this script from the git root" >&2
+    exit 1
+fi
+# HEAD must be exactly origin/master before syncing in either direction.
+if [ "$(git rev-parse HEAD)" != "$(git rev-parse origin/master)" ]; then
+    echo "$(pwd) is not at origin/master" >&2
+    exit 1
+fi
+# Download the git-subtree.sh helper script if it is not already present.
+if [ ! -f library/portable-simd/git-subtree.sh ]; then
+    curl -sS https://raw.githubusercontent.com/bjorn3/git/tqc-subtree-portable/contrib/subtree/git-subtree.sh -o library/portable-simd/git-subtree.sh
+    chmod +x library/portable-simd/git-subtree.sh
+fi
+
+today=$(date +%Y-%m-%d)
+# $1 selects the sync direction; any value other than push/pull matches no arm and does nothing.
+case $1 in
+    "push")
+        upstream=rust-upstream-$today
+        merge=sync-from-rust-$today
+        # Bring master in the checkout at $2 up to date first.
+        pushd $2
+        git checkout master
+        git pull
+        popd
+        # Presumably exports the subtree history into $2 as branch $upstream - confirm against git-subtree.sh.
+        library/portable-simd/git-subtree.sh push -P library/portable-simd $2 $upstream
+        # In $2, (re)create $merge from origin/master and merge $upstream into it.
+        pushd $2
+        git checkout -B $merge origin/master
+        git merge $upstream
+        popd
+        echo "Branch \`$merge\` created in \`$2\`. You may need to resolve merge conflicts."
+        ;;
+    "pull")
+        branch=sync-from-portable-simd-$today
+        # Switch to a dated branch (created or reset by -B) before pulling into the subtree.
+        git checkout -B $branch
+        echo "Creating branch \`$branch\`... You may need to resolve merge conflicts."
+        library/portable-simd/git-subtree.sh pull -P library/portable-simd $2 origin/master
+        ;;
+esac

From 52b42d71876d509340c432671c63217427eef165 Mon Sep 17 00:00:00 2001
From: Caleb Zulawski <caleb.zulawski@gmail.com>
Date: Sat, 18 Jan 2025 21:44:41 -0500
Subject: [PATCH 74/74] Update tests for std::simd subtree sync

---
 .../tests/pass/intrinsics/portable-simd.rs    | 21 -------------------
 ...n.DataflowConstProp.32bit.panic-abort.diff |  2 +-
 ....DataflowConstProp.32bit.panic-unwind.diff |  2 +-
 ...n.DataflowConstProp.64bit.panic-abort.diff |  2 +-
 ....DataflowConstProp.64bit.panic-unwind.diff |  2 +-
 ...oxed_slice.main.GVN.32bit.panic-abort.diff |  2 +-
 ...xed_slice.main.GVN.32bit.panic-unwind.diff |  2 +-
 ...oxed_slice.main.GVN.64bit.panic-abort.diff |  2 +-
 ...xed_slice.main.GVN.64bit.panic-unwind.diff |  2 +-
 .../gvn_ptr_eq_with_constant.main.GVN.diff    |  2 +-
 ...ated_loop.PreCodegen.after.panic-abort.mir |  2 +-
 ...ted_loop.PreCodegen.after.panic-unwind.mir |  2 +-
 ...ward_loop.PreCodegen.after.panic-abort.mir |  2 +-
 ...ard_loop.PreCodegen.after.panic-unwind.mir |  2 +-
 ...erse_loop.PreCodegen.after.panic-abort.mir |  2 +-
 ...rse_loop.PreCodegen.after.panic-unwind.mir |  2 +-
 16 files changed, 15 insertions(+), 36 deletions(-)

diff --git a/src/tools/miri/tests/pass/intrinsics/portable-simd.rs b/src/tools/miri/tests/pass/intrinsics/portable-simd.rs
index acd3502f5289d..a61f29775c1f4 100644
--- a/src/tools/miri/tests/pass/intrinsics/portable-simd.rs
+++ b/src/tools/miri/tests/pass/intrinsics/portable-simd.rs
@@ -300,27 +300,6 @@ fn simd_mask() {
         }
     }
 
-    // This used to cause an ICE. It exercises simd_select_bitmask with an array as input.
-    let bitmask = u8x4::from_array([0b00001101, 0, 0, 0]);
-    assert_eq!(
-        mask32x4::from_bitmask_vector(bitmask),
-        mask32x4::from_array([true, false, true, true]),
-    );
-    let bitmask = u8x8::from_array([0b01000101, 0, 0, 0, 0, 0, 0, 0]);
-    assert_eq!(
-        mask32x8::from_bitmask_vector(bitmask),
-        mask32x8::from_array([true, false, true, false, false, false, true, false]),
-    );
-    let bitmask =
-        u8x16::from_array([0b01000101, 0b11110000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
-    assert_eq!(
-        mask32x16::from_bitmask_vector(bitmask),
-        mask32x16::from_array([
-            true, false, true, false, false, false, true, false, false, false, false, false, true,
-            true, true, true,
-        ]),
-    );
-
     // Also directly call simd_select_bitmask, to test both kinds of argument types.
     unsafe {
         // These masks are exactly the results we got out above in the `simd_bitmask` tests.
diff --git a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.32bit.panic-abort.diff b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.32bit.panic-abort.diff
index 5a830254f6199..2c89670dcf7d7 100644
--- a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.32bit.panic-abort.diff
+++ b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.32bit.panic-abort.diff
@@ -26,7 +26,7 @@
                           }
                           scope 11 (inlined NonZero::<usize>::get) {
                           }
-                          scope 12 (inlined without_provenance::<[bool; 0]>) {
+                          scope 12 (inlined std::ptr::without_provenance::<[bool; 0]>) {
                               scope 13 (inlined without_provenance_mut::<[bool; 0]>) {
                               }
                           }
diff --git a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.32bit.panic-unwind.diff b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.32bit.panic-unwind.diff
index c11368a347c57..8fecfe224cc69 100644
--- a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.32bit.panic-unwind.diff
+++ b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.32bit.panic-unwind.diff
@@ -26,7 +26,7 @@
                           }
                           scope 11 (inlined NonZero::<usize>::get) {
                           }
-                          scope 12 (inlined without_provenance::<[bool; 0]>) {
+                          scope 12 (inlined std::ptr::without_provenance::<[bool; 0]>) {
                               scope 13 (inlined without_provenance_mut::<[bool; 0]>) {
                               }
                           }
diff --git a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.64bit.panic-abort.diff b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.64bit.panic-abort.diff
index 037ed02ce6556..976ea252c2f89 100644
--- a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.64bit.panic-abort.diff
+++ b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.64bit.panic-abort.diff
@@ -26,7 +26,7 @@
                           }
                           scope 11 (inlined NonZero::<usize>::get) {
                           }
-                          scope 12 (inlined without_provenance::<[bool; 0]>) {
+                          scope 12 (inlined std::ptr::without_provenance::<[bool; 0]>) {
                               scope 13 (inlined without_provenance_mut::<[bool; 0]>) {
                               }
                           }
diff --git a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.64bit.panic-unwind.diff b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.64bit.panic-unwind.diff
index 86351c7875933..6c59f5e3e2e86 100644
--- a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.64bit.panic-unwind.diff
+++ b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.DataflowConstProp.64bit.panic-unwind.diff
@@ -26,7 +26,7 @@
                           }
                           scope 11 (inlined NonZero::<usize>::get) {
                           }
-                          scope 12 (inlined without_provenance::<[bool; 0]>) {
+                          scope 12 (inlined std::ptr::without_provenance::<[bool; 0]>) {
                               scope 13 (inlined without_provenance_mut::<[bool; 0]>) {
                               }
                           }
diff --git a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.32bit.panic-abort.diff b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.32bit.panic-abort.diff
index 20a3897a934f0..1f9cf6d6aca83 100644
--- a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.32bit.panic-abort.diff
+++ b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.32bit.panic-abort.diff
@@ -26,7 +26,7 @@
                           }
                           scope 11 (inlined NonZero::<usize>::get) {
                           }
-                          scope 12 (inlined without_provenance::<[bool; 0]>) {
+                          scope 12 (inlined std::ptr::without_provenance::<[bool; 0]>) {
                               scope 13 (inlined without_provenance_mut::<[bool; 0]>) {
                               }
                           }
diff --git a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.32bit.panic-unwind.diff b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.32bit.panic-unwind.diff
index 2e396301fd0ec..a8760285fac11 100644
--- a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.32bit.panic-unwind.diff
+++ b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.32bit.panic-unwind.diff
@@ -26,7 +26,7 @@
                           }
                           scope 11 (inlined NonZero::<usize>::get) {
                           }
-                          scope 12 (inlined without_provenance::<[bool; 0]>) {
+                          scope 12 (inlined std::ptr::without_provenance::<[bool; 0]>) {
                               scope 13 (inlined without_provenance_mut::<[bool; 0]>) {
                               }
                           }
diff --git a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.64bit.panic-abort.diff b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.64bit.panic-abort.diff
index 319691174cf66..c398ae70a1a3e 100644
--- a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.64bit.panic-abort.diff
+++ b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.64bit.panic-abort.diff
@@ -26,7 +26,7 @@
                           }
                           scope 11 (inlined NonZero::<usize>::get) {
                           }
-                          scope 12 (inlined without_provenance::<[bool; 0]>) {
+                          scope 12 (inlined std::ptr::without_provenance::<[bool; 0]>) {
                               scope 13 (inlined without_provenance_mut::<[bool; 0]>) {
                               }
                           }
diff --git a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.64bit.panic-unwind.diff b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.64bit.panic-unwind.diff
index 5dafc89d53f29..02934c02587d2 100644
--- a/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.64bit.panic-unwind.diff
+++ b/tests/mir-opt/dataflow-const-prop/default_boxed_slice.main.GVN.64bit.panic-unwind.diff
@@ -26,7 +26,7 @@
                           }
                           scope 11 (inlined NonZero::<usize>::get) {
                           }
-                          scope 12 (inlined without_provenance::<[bool; 0]>) {
+                          scope 12 (inlined std::ptr::without_provenance::<[bool; 0]>) {
                               scope 13 (inlined without_provenance_mut::<[bool; 0]>) {
                               }
                           }
diff --git a/tests/mir-opt/gvn_ptr_eq_with_constant.main.GVN.diff b/tests/mir-opt/gvn_ptr_eq_with_constant.main.GVN.diff
index 8e7964297d060..f56af33ea603f 100644
--- a/tests/mir-opt/gvn_ptr_eq_with_constant.main.GVN.diff
+++ b/tests/mir-opt/gvn_ptr_eq_with_constant.main.GVN.diff
@@ -16,7 +16,7 @@
                       }
                       scope 8 (inlined NonZero::<usize>::get) {
                       }
-                      scope 9 (inlined without_provenance::<u8>) {
+                      scope 9 (inlined std::ptr::without_provenance::<u8>) {
                           scope 10 (inlined without_provenance_mut::<u8>) {
                           }
                       }
diff --git a/tests/mir-opt/pre-codegen/slice_iter.enumerated_loop.PreCodegen.after.panic-abort.mir b/tests/mir-opt/pre-codegen/slice_iter.enumerated_loop.PreCodegen.after.panic-abort.mir
index 496ec78fd8d3e..b7a9b4a1fe01c 100644
--- a/tests/mir-opt/pre-codegen/slice_iter.enumerated_loop.PreCodegen.after.panic-abort.mir
+++ b/tests/mir-opt/pre-codegen/slice_iter.enumerated_loop.PreCodegen.after.panic-abort.mir
@@ -59,7 +59,7 @@ fn enumerated_loop(_1: &[T], _2: impl Fn(usize, &T)) -> () {
                     let _9: *const T;
                     scope 7 {
                     }
-                    scope 12 (inlined without_provenance::<T>) {
+                    scope 12 (inlined std::ptr::without_provenance::<T>) {
                         scope 13 (inlined without_provenance_mut::<T>) {
                         }
                     }
diff --git a/tests/mir-opt/pre-codegen/slice_iter.enumerated_loop.PreCodegen.after.panic-unwind.mir b/tests/mir-opt/pre-codegen/slice_iter.enumerated_loop.PreCodegen.after.panic-unwind.mir
index c4547cb888fab..33dbf04d028db 100644
--- a/tests/mir-opt/pre-codegen/slice_iter.enumerated_loop.PreCodegen.after.panic-unwind.mir
+++ b/tests/mir-opt/pre-codegen/slice_iter.enumerated_loop.PreCodegen.after.panic-unwind.mir
@@ -34,7 +34,7 @@ fn enumerated_loop(_1: &[T], _2: impl Fn(usize, &T)) -> () {
                     let _9: *const T;
                     scope 7 {
                     }
-                    scope 12 (inlined without_provenance::<T>) {
+                    scope 12 (inlined std::ptr::without_provenance::<T>) {
                         scope 13 (inlined without_provenance_mut::<T>) {
                         }
                     }
diff --git a/tests/mir-opt/pre-codegen/slice_iter.forward_loop.PreCodegen.after.panic-abort.mir b/tests/mir-opt/pre-codegen/slice_iter.forward_loop.PreCodegen.after.panic-abort.mir
index 7d011ea3347f3..dc13bb23c310a 100644
--- a/tests/mir-opt/pre-codegen/slice_iter.forward_loop.PreCodegen.after.panic-abort.mir
+++ b/tests/mir-opt/pre-codegen/slice_iter.forward_loop.PreCodegen.after.panic-abort.mir
@@ -31,7 +31,7 @@ fn forward_loop(_1: &[T], _2: impl Fn(&T)) -> () {
                     let _9: *const T;
                     scope 7 {
                     }
-                    scope 12 (inlined without_provenance::<T>) {
+                    scope 12 (inlined std::ptr::without_provenance::<T>) {
                         scope 13 (inlined without_provenance_mut::<T>) {
                         }
                     }
diff --git a/tests/mir-opt/pre-codegen/slice_iter.forward_loop.PreCodegen.after.panic-unwind.mir b/tests/mir-opt/pre-codegen/slice_iter.forward_loop.PreCodegen.after.panic-unwind.mir
index 75e6542a3a4b0..3f1e0e0f746b4 100644
--- a/tests/mir-opt/pre-codegen/slice_iter.forward_loop.PreCodegen.after.panic-unwind.mir
+++ b/tests/mir-opt/pre-codegen/slice_iter.forward_loop.PreCodegen.after.panic-unwind.mir
@@ -31,7 +31,7 @@ fn forward_loop(_1: &[T], _2: impl Fn(&T)) -> () {
                     let _9: *const T;
                     scope 7 {
                     }
-                    scope 12 (inlined without_provenance::<T>) {
+                    scope 12 (inlined std::ptr::without_provenance::<T>) {
                         scope 13 (inlined without_provenance_mut::<T>) {
                         }
                     }
diff --git a/tests/mir-opt/pre-codegen/slice_iter.reverse_loop.PreCodegen.after.panic-abort.mir b/tests/mir-opt/pre-codegen/slice_iter.reverse_loop.PreCodegen.after.panic-abort.mir
index 41bc91ab028dc..4b7ab4516d25d 100644
--- a/tests/mir-opt/pre-codegen/slice_iter.reverse_loop.PreCodegen.after.panic-abort.mir
+++ b/tests/mir-opt/pre-codegen/slice_iter.reverse_loop.PreCodegen.after.panic-abort.mir
@@ -34,7 +34,7 @@ fn reverse_loop(_1: &[T], _2: impl Fn(&T)) -> () {
                     let _9: *const T;
                     scope 7 {
                     }
-                    scope 12 (inlined without_provenance::<T>) {
+                    scope 12 (inlined std::ptr::without_provenance::<T>) {
                         scope 13 (inlined without_provenance_mut::<T>) {
                         }
                     }
diff --git a/tests/mir-opt/pre-codegen/slice_iter.reverse_loop.PreCodegen.after.panic-unwind.mir b/tests/mir-opt/pre-codegen/slice_iter.reverse_loop.PreCodegen.after.panic-unwind.mir
index 6ed8ef9715bb5..b2c15247cd7be 100644
--- a/tests/mir-opt/pre-codegen/slice_iter.reverse_loop.PreCodegen.after.panic-unwind.mir
+++ b/tests/mir-opt/pre-codegen/slice_iter.reverse_loop.PreCodegen.after.panic-unwind.mir
@@ -34,7 +34,7 @@ fn reverse_loop(_1: &[T], _2: impl Fn(&T)) -> () {
                     let _9: *const T;
                     scope 7 {
                     }
-                    scope 12 (inlined without_provenance::<T>) {
+                    scope 12 (inlined std::ptr::without_provenance::<T>) {
                         scope 13 (inlined without_provenance_mut::<T>) {
                         }
                     }