diff --git a/Cargo.toml b/Cargo.toml
index a98cb6a9a..08deb9fa3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,6 +18,11 @@ default = ["std"]
 # implementation uses C intrinsics and requires a C compiler.
 neon = []
 
+# The RVV implementation does not participate in dynamic feature detection,
+# which is currently x86-only. If "rvv" is on, RVV support is assumed. The
+# RVV implementation uses C intrinsics and requires a C compiler.
+rvv = []
+
 # This crate uses libstd for std::io trait implementations, and also for
 # runtime CPU feature detection. This feature is enabled by default. If you use
 # --no-default-features, the only way to use the SIMD implementations in this
diff --git a/benches/bench.rs b/benches/bench.rs
index 5efb9e6f5..780e5b638 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -85,6 +85,14 @@ fn bench_single_compression_avx512(b: &mut Bencher) {
     }
 }
 
+#[bench]
+#[cfg(feature = "rvv")]
+fn bench_single_compression_rvv(b: &mut Bencher) {
+    if let Some(platform) = Platform::rvv() {
+        bench_single_compression_fn(b, platform);
+    }
+}
+
 fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
     let degree = platform.simd_degree();
     let mut inputs = Vec::new();
@@ -151,6 +159,14 @@ fn bench_many_chunks_neon(b: &mut Bencher) {
     }
 }
 
+#[bench]
+#[cfg(feature = "rvv")]
+fn bench_many_chunks_rvv(b: &mut Bencher) {
+    if let Some(platform) = Platform::rvv() {
+        bench_many_chunks_fn(b, platform);
+    }
+}
+
 // TODO: When we get const generics we can unify this with the chunks code.
 fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) {
     let degree = platform.simd_degree();
@@ -218,6 +234,14 @@ fn bench_many_parents_neon(b: &mut Bencher) {
     }
 }
 
+#[bench]
+#[cfg(feature = "rvv")]
+fn bench_many_parents_rvv(b: &mut Bencher) {
+    if let Some(platform) = Platform::rvv() {
+        bench_many_parents_fn(b, platform);
+    }
+}
+
 fn bench_atonce(b: &mut Bencher, len: usize) {
     let mut input = RandomInput::new(b, len);
     b.iter(|| blake3::hash(input.get()));
diff --git a/build.rs b/build.rs
index a5dfd0625..2985ea28d 100644
--- a/build.rs
+++ b/build.rs
@@ -1,3 +1,4 @@
+use core::panic;
 use std::env;
 
 fn defined(var: &str) -> bool {
@@ -21,6 +22,14 @@ fn is_no_neon() -> bool {
     defined("CARGO_FEATURE_NO_NEON")
 }
 
+fn is_rvv() -> bool {
+    defined("CARGO_FEATURE_RVV")
+}
+
+fn is_no_rvv() -> bool {
+    defined("CARGO_FEATURE_NO_RVV")
+}
+
 fn is_ci() -> bool {
     defined("BLAKE3_CI")
 }
@@ -60,6 +69,18 @@ fn is_armv7() -> bool {
     target_components()[0] == "armv7"
 }
 
+fn is_riscv32() -> bool {
+    std::env::var("CARGO_CFG_TARGET_ARCH")
+        .map(|target_arch| target_arch == "riscv32")
+        .unwrap_or_default()
+}
+
+fn is_riscv64() -> bool {
+    std::env::var("CARGO_CFG_TARGET_ARCH")
+        .map(|target_arch| target_arch == "riscv64")
+        .unwrap_or_default()
+}
+
 fn endianness() -> String {
     let endianness = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap();
     assert!(endianness == "little" || endianness == "big");
@@ -239,15 +260,33 @@ fn build_neon_c_intrinsics() {
     build.compile("blake3_neon");
 }
 
+fn build_rvv_c_intrinsics() {
+    let mut build = new_build();
+    build.file("c/blake3_rvv.c");
+    if is_riscv32() {
+        build.flag("-march=rv32gcv1p0");
+    }
+    if is_riscv64() {
+        build.flag("-march=rv64gcv1p0");
+    }
+    build.compile("blake3_rvv");
+}
+
 fn main() -> Result<(), Box<dyn std::error::Error>> {
     if is_pure() && is_neon() {
         panic!("It doesn't make sense to enable both \"pure\" and \"neon\".");
     }
-
     if is_no_neon() && is_neon() {
         panic!("It doesn't make sense to enable both \"no_neon\" and \"neon\".");
     }
+    if is_pure() && is_rvv() {
+        panic!("It doesn't make sense to enable both \"pure\" and \"rvv\".");
+    }
+    if is_no_rvv() && is_rvv() {
+        panic!("It doesn't make sense to enable both \"no_rvv\" and \"rvv\".");
+    }
+
     if is_x86_64() || is_x86_32() {
         let support = c_compiler_support();
         if is_x86_32() || should_prefer_intrinsics() || is_pure() || support == NoCompiler {
@@ -278,6 +317,11 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         build_neon_c_intrinsics();
     }
 
+    if (is_riscv32() || is_riscv64()) && is_rvv() {
+        println!("cargo:rustc-cfg=blake3_rvv");
+        build_rvv_c_intrinsics();
+    }
+
     // The `cc` crate doesn't automatically emit rerun-if directives for the
     // environment variables it supports, in particular for $CC. We expect to
     // do a lot of benchmarking across different compilers, so we explicitly
diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt
index 5cde373d9..825c3583f 100644
--- a/c/CMakeLists.txt
+++ b/c/CMakeLists.txt
@@ -33,6 +33,7 @@ include(GNUInstallDirs)
 set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64)
 set(BLAKE3_X86_NAMES i686 x86 X86)
 set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a)
+set(BLAKE3_RISCV64_NAMES riscv64)
 # default SIMD compiler flag configuration (can be overriden by toolchains or CLI)
 if(MSVC)
   set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2")
@@ -48,6 +49,7 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU"
   set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler flags to enable SSE4.1")
   set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2")
   set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512")
+  set(BLAKE3_CFLAGS_RVV_RISCV64 "-march=rv64gcv1p0" CACHE STRING "the compiler flags to enable RVV for riscv64")
 
   if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
         AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
@@ -95,8 +97,10 @@ macro(BLAKE3_DISABLE_SIMD)
   set(BLAKE3_SIMD_AMD64_ASM OFF)
   set(BLAKE3_SIMD_X86_INTRINSICS OFF)
   set(BLAKE3_SIMD_NEON_INTRINSICS OFF)
+  set(BLAKE3_VLA_RVV_INTRINSICS OFF)
   target_compile_definitions(blake3 PRIVATE
     BLAKE3_USE_NEON=0
+    BLAKE3_USE_RVV=0
     BLAKE3_NO_SSE2
     BLAKE3_NO_SSE41
     BLAKE3_NO_AVX2
@@ -179,6 +183,13 @@ elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
     set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}")
   endif()
 
+elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_RISCV64_NAMES
+        OR BLAKE3_USE_RVV_INTRINSICS))
+  set(BLAKE3_VLA_RVV_INTRINSICS ON)
+  target_sources(blake3 PRIVATE
+    blake3_rvv.c
+  )
+  set_source_files_properties(blake3_rvv.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_RVV_RISCV64}")
 else()
   BLAKE3_DISABLE_SIMD()
 endif()
@@ -216,4 +227,5 @@ install(FILES "${CMAKE_BINARY_DIR}/libblake3.pc"
 add_feature_info("AMD64 assembly" BLAKE3_SIMD_AMD64_ASM "The library uses hand written amd64 SIMD assembly.")
 add_feature_info("x86 SIMD intrinsics" BLAKE3_SIMD_X86_INTRINSICS "The library uses x86 SIMD intrinsics.")
 add_feature_info("NEON SIMD intrinsics" BLAKE3_SIMD_NEON_INTRINSICS "The library uses NEON SIMD intrinsics.")
+add_feature_info("RISC-V RVV intrinsics" BLAKE3_VLA_RVV_INTRINSICS "The library uses RISC-V RVV intrinsics.")
 feature_summary(WHAT ENABLED_FEATURES)
diff --git a/c/blake3_c_rust_bindings/src/test.rs b/c/blake3_c_rust_bindings/src/test.rs
index 0730d9306..f7f751203 100644
--- a/c/blake3_c_rust_bindings/src/test.rs
+++ b/c/blake3_c_rust_bindings/src/test.rs
@@ -192,6 +192,15 @@ fn test_compress_avx512() {
     );
 }
 
+#[test]
+#[cfg(feature = "rvv")]
+fn test_compress_rvv() {
+    test_compress_fn(
+        crate::ffi::rvv::blake3_compress_in_place_rvv,
+        crate::ffi::rvv::blake3_compress_xof_rvv,
+    );
+}
+
 type HashManyFn = unsafe extern "C" fn(
     inputs: *const *const u8,
     num_inputs: usize,
@@ -359,6 +368,12 @@ fn test_hash_many_neon() {
     test_hash_many_fn(crate::ffi::neon::blake3_hash_many_neon);
 }
 
+#[test]
+#[cfg(feature = "rvv")]
+fn test_hash_many_rvv() {
+    test_hash_many_fn(crate::ffi::rvv::blake3_hash_many_rvv);
+}
+
 #[test]
 fn test_compare_reference_impl() {
     const OUT: usize = 303; // more than 64, not a multiple of 4
diff --git a/c/blake3_dispatch.c b/c/blake3_dispatch.c
index af6c3dadc..9149fa378 100644
--- a/c/blake3_dispatch.c
+++ b/c/blake3_dispatch.c
@@ -267,6 +267,12 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
   return;
 #endif
 
+#if BLAKE3_USE_RVV == 1
+  blake3_hash_many_rvv(inputs, num_inputs, blocks, key, counter,
+                       increment_counter, flags, flags_start, flags_end, out);
+  return;
+#endif
+
   blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                             increment_counter, flags, flags_start, flags_end,
                             out);
diff --git a/c/blake3_impl.h b/c/blake3_impl.h
index beab5cf53..6c10d8878 100644
--- a/c/blake3_impl.h
+++ b/c/blake3_impl.h
@@ -281,5 +281,22 @@ void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
                            uint8_t flags_end, uint8_t *out);
 #endif
 
+#if BLAKE3_USE_RVV == 1
+void blake3_compress_in_place_rvv(uint32_t cv[8],
+                                  const uint8_t block[BLAKE3_BLOCK_LEN],
+                                  uint8_t block_len, uint64_t counter,
+                                  uint8_t flags);
+
+void blake3_compress_xof_rvv(const uint32_t cv[8],
+                             const uint8_t block[BLAKE3_BLOCK_LEN],
+                             uint8_t block_len, uint64_t counter, uint8_t flags,
+                             uint8_t out[64]);
+
+void blake3_hash_many_rvv(const uint8_t *const *inputs, size_t num_inputs,
+                          size_t blocks, const uint32_t key[8],
+                          uint64_t counter, bool increment_counter,
+                          uint8_t flags, uint8_t flags_start, uint8_t flags_end,
+                          uint8_t *out);
+#endif
 
 #endif /* BLAKE3_IMPL_H */
diff --git a/c/blake3_rvv.c b/c/blake3_rvv.c
new file mode 100644
index 000000000..ca3dd2242
--- /dev/null
+++ b/c/blake3_rvv.c
@@ -0,0 +1,300 @@
+#include "blake3_impl.h"
+#include <riscv_vector.h>
+
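+// The two rotate helpers below play different roles: rvv_vror_v rotates the *lanes*
+// of a vector right by `shr` positions (for 0 < shr < vl this should give
+// dst[i] = src[(i + shr) % vl]), which diagonalize()/undiagonalize() use to rotate
+// the state rows, while rvv_vror_s rotates the *bits* of each 32-bit lane right by
+// `shr`, which implements the rotation amounts inside g1()/g2().
+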
+INLINE vuint32m1_t rvv_vror_v(vuint32m1_t src, size_t shr, size_t vl) {
+  vuint32m1_t dst = __riscv_vundefined_u32m1();
+  size_t delta = vl - shr;
+  dst = __riscv_vslideup(dst, src, delta, vl);       // VL = vl
+  dst = __riscv_vslidedown_tu(dst, src, shr, delta); // VL = delta
+  return dst;
+}
+
+INLINE vuint32m1_t rvv_vror_s(vuint32m1_t src, size_t shr, size_t vl) {
+  size_t shl = sizeof(uint32_t) * 8 - shr;
+  vuint32m1_t op0 = __riscv_vsrl(src, shr, vl);
+  vuint32m1_t op1 = __riscv_vsll(src, shl, vl);
+  return __riscv_vor(op0, op1, vl);
+}
+
+// NOTE: See the following for several approaches to transposing matrices with RVV:
+// https://fprox.substack.com/p/transposing-a-matrix-using-risc-v
+//
+// This version of transpose_nxn uses the strided store approach, which easily scales to NxN
+// matrices. For LMUL=1, the data at the above link suggests that this approach may be less
+// efficient than a scalar implementation for large N > 16. However, for LMUL=8, the data suggests
+// this vectorized approach may be more efficient than a scalar implementation at least up to N=512.
+//
+// If we assume a typical vector size of 128b (the minimum; no idea how representative this will
+// be), then LMUL=8 gives us vector groups of 1024b (x 4, since RVV has 32 registers). This should
+// let us process 32 state vectors at a time (c.f., 8 for AVX2, 16 for AVX512). Wider base vector
+// registers would increase this further, of course.
+//
+// One of the more efficient alternative approaches would be to use segmented loads/stores. However,
+// this would limit us to 8-row matrices, based on the currently supported vector tuple-sizes.
+//
+// With larger vector registers (or larger LMUL), the in-register masked slide approach may also be
+// worth exploring.
+
+INLINE
+void rvv_transpose_nxn(uint32_t *dst, uint32_t *src, size_t n) {
+  for (size_t row_idx = 0; row_idx < n; row_idx += 1) {
+    size_t avl = n;
+    uint32_t *row_src = src + row_idx * n;
+    uint32_t *row_dst = dst + row_idx;
+    for (
+        /* clang-format off */
+        size_t vl = __riscv_vsetvl_e32m8(avl);
+        0 < avl;
+        avl -= vl,
+        row_src += vl,
+        row_dst += vl * n,
+        vl = __riscv_vsetvl_e32m8(avl)
+        /* clang-format on */
+    ) {
+      vuint32m8_t row = __riscv_vle32_v_u32m8(row_src, vl);
+      __riscv_vsse32(row_dst, sizeof(uint32_t) * n, row, vl);
+    }
+  }
+}
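+
+// A usage sketch for rvv_transpose_nxn (nothing calls it yet; this is the intended
+// layout, not a guarantee): with `src` holding one row of n words per input in
+// row-major order, the strided store above lands word w of input i at dst[w * n + i],
+// so each word position ends up contiguous across inputs and can be loaded as a
+// single vector:
+//
+//   uint32_t src[8 * 8]; // 8 inputs x 8 CV words, row-major
+//   uint32_t dst[8 * 8];
+//   rvv_transpose_nxn(dst, src, 8);
+//   // now dst[w * 8 + i] == src[i * 8 + w] for 0 <= i, w < 8
+//
+// The helpers below are RVV counterparts of the 128-bit unpack/shuffle/blend idioms
+// that the x86 implementations rely on (unpacklo/unpackhi on 32- and 64-bit lanes,
+// shuffles, and a 16-bit blend), expressed with slides, gathers, and merges.
+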
+INLINE
+vuint32m1_t rvv_zip_lo_u32(vuint32m1_t op0, vuint32m1_t op1) {
+  vuint32m1_t odd_mask_u32 = __riscv_vmv_v_x_u32m1(0xAAAA, 1);
+  vbool32_t odd_mask = __riscv_vreinterpret_v_u32m1_b32(odd_mask_u32);
+  op0 = __riscv_vslideup_tu(op0, op0, 1, 4);               // VL = 4
+  op1 = __riscv_vslideup_tu(op1, op1, 2, 4);               // VL = 4
+  op1 = __riscv_vslideup_tu(op1, op1, 1, 2);               // VL = 2
+  return __riscv_vmerge_vvm_u32m1(op0, op1, odd_mask, 4);  // VL = 4
+}
+
+INLINE
+vuint32m1_t rvv_zip_hi_u32(vuint32m1_t op0, vuint32m1_t op1) {
+  vuint32m1_t odd_mask_u32 = __riscv_vmv_v_x_u32m1(0xAAAA, 1);
+  vbool32_t odd_mask = __riscv_vreinterpret_v_u32m1_b32(odd_mask_u32);
+  op0 = __riscv_vslidedown_tu(op0, op0, 1, 4);             // VL = 4
+  op0 = __riscv_vslidedown_tu(op0, op0, 1, 2);             // VL = 2
+  op1 = __riscv_vslidedown_tu(op1, op1, 1, 2);             // VL = 2
+  return __riscv_vmerge_vvm_u32m1(op0, op1, odd_mask, 4);  // VL = 4
+}
+
+INLINE
+vuint32m1_t rvv_zip_lo_u64(vuint32m1_t op0, vuint32m1_t op1) {
+  vuint64m1_t op0_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op0);
+  vuint64m1_t op1_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op1);
+  vuint64m1_t dst_u64 = __riscv_vslideup_tu(op0_u64, op1_u64, 1, 2); // VL = 2
+  return __riscv_vreinterpret_v_u64m1_u32m1(dst_u64);
+}
+
+INLINE
+vuint32m1_t rvv_zip_hi_u64(vuint32m1_t op0, vuint32m1_t op1) {
+  vuint64m1_t op0_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op0);
+  vuint64m1_t op1_u64 = __riscv_vreinterpret_v_u32m1_u64m1(op1);
+  vuint64m1_t dst_u64 = __riscv_vslidedown_tu(op1_u64, op0_u64, 1, 1); // VL = 1
+  return __riscv_vreinterpret_v_u64m1_u32m1(dst_u64);
+}
+
+INLINE
+vuint32m1_t rvv_shuffle_zip_lo_hi_u256(vuint32m1_t op0, vuint32m1_t op1,
+                                       vuint32m1_t tab) {
+  op1 = __riscv_vrgather_vv_u32m1(op1, tab, 4);         // VL = 4
+  op0 = __riscv_vrgather_vv_u32m1_tu(op1, op0, tab, 2); // VL = 2
+  return op0;
+}
+
+INLINE
+vuint32m1_t rvv_shuffle_u128(vuint32m1_t src, vuint32m1_t tab) {
+  return __riscv_vrgather_vv_u32m1(src, tab, 4); // VL = 4
+}
+
+INLINE
+vuint32m1_t rvv_blend_u16(vuint32m1_t op0, vuint32m1_t op1, uint16_t mask) {
+  vuint16m1_t op0_u16 = __riscv_vreinterpret_v_u32m1_u16m1(op0);
+  vuint16m1_t op1_u16 = __riscv_vreinterpret_v_u32m1_u16m1(op1);
+  vbool16_t mask_u16 = __riscv_vreinterpret_v_u16m1_b16(__riscv_vmv_v_x_u16m1(mask, 1));
+  vuint16m1_t dst = __riscv_vmerge_vvm_u16m1(op0_u16, op1_u16, mask_u16, 4); // VL = 4
+  return __riscv_vreinterpret_v_u16m1_u32m1(dst);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * compress_rvv
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void g1(vuint32m1_t *row0, vuint32m1_t *row1, vuint32m1_t *row2,
+               vuint32m1_t *row3, vuint32m1_t m, 
size_t vl) { + *row0 = __riscv_vadd(*row0, m, vl); + *row0 = __riscv_vadd(*row0, *row1, vl); + *row3 = __riscv_vxor(*row3, *row0, vl); + *row3 = rvv_vror_s(*row3, 16, vl); + *row2 = __riscv_vadd(*row2, *row3, vl); + *row1 = __riscv_vxor(*row1, *row2, vl); + *row1 = rvv_vror_s(*row1, 12, vl); +} + +INLINE void g2(vuint32m1_t *row0, vuint32m1_t *row1, vuint32m1_t *row2, + vuint32m1_t *row3, vuint32m1_t m, size_t vl) { + *row0 = __riscv_vadd(*row0, m, vl); + *row0 = __riscv_vadd(*row0, *row1, vl); + *row3 = __riscv_vxor(*row3, *row0, vl); + *row3 = rvv_vror_s(*row3, 8, vl); + *row2 = __riscv_vadd(*row2, *row3, vl); + *row1 = __riscv_vxor(*row1, *row2, vl); + *row1 = rvv_vror_s(*row1, 7, vl); +} + +INLINE void diagonalize(vuint32m1_t *row0, vuint32m1_t *row2, vuint32m1_t *row3, + size_t vl) { + *row0 = rvv_vror_v(*row0, 3, vl); + *row3 = rvv_vror_v(*row3, 2, vl); + *row2 = rvv_vror_v(*row2, 1, vl); +} + +INLINE void undiagonalize(vuint32m1_t *row0, vuint32m1_t *row2, + vuint32m1_t *row3, size_t vl) { + *row0 = rvv_vror_v(*row0, 1, vl); + *row3 = rvv_vror_v(*row3, 2, vl); + *row2 = rvv_vror_v(*row2, 3, vl); +} + +INLINE void compress_pre(vuint32m1x4_t *rows, const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + size_t vl) { + (void)rows; + (void)cv; + (void)block; + (void)block_len; + (void)counter; + (void)flags; + (void)vl; + + // 0, 0, 3, 3 + // 0, 1, 3, 2 + // 1, 3, 2, 0 + // 2, 0, 2, 0 + // 3, 1, 1, 2 + // 3, 1, 3, 1 + // 3, 3, 2, 2 + + // 2, 1, 0, 3 (rotate) + // 1, 3, 2, 0 (rotate) + // 0, 3, 2, 1 (rotate) +} + +void blake3_compress_xof_rvv(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, uint8_t flags, + uint8_t out[64]) { + assert((uintptr_t)&block[0] % sizeof(uint32_t) == 0); // FIXME: alignment + assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0); // FIXME: alignment + (void)cv; + (void)block; + (void)block_len; + (void)counter; + (void)flags; + (void)out; +} + +void blake3_compress_in_place_rvv(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags) { + assert((uintptr_t)&block[0] % sizeof(uint32_t) == 0); // FIXME: alignment + (void)cv; + (void)block; + (void)block_len; + (void)counter; + (void)flags; +} + +/* + * ---------------------------------------------------------------------------- + * hash_vl_rvv + * ---------------------------------------------------------------------------- + */ + +void round_fn_vl() { + // +} + +void transpose_vecs_vl() { + // +} + +void transpose_msg_vecs_vl() { + // +} + +void load_counters_vl() { + // +} + +void blake3_hash_vl_rvv(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out, + size_t vl) { + assert((uintptr_t)&inputs[0] % sizeof(uint32_t) == 0); // FIXME: alignment + assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0); // FIXME: alignment + (void)inputs; + (void)blocks; + (void)key; + (void)counter; + (void)increment_counter; + (void)flags; + (void)flags_start; + (void)flags_end; + (void)out; + (void)vl; +} + +/* + * ---------------------------------------------------------------------------- + * hash_many_rvv + * ---------------------------------------------------------------------------- + */ + +INLINE void hash_one_rvv(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, uint8_t flags, + uint8_t flags_start, 
uint8_t flags_end,
+                         uint8_t out[BLAKE3_OUT_LEN]) {
+  assert((uintptr_t)&input[0] % sizeof(uint32_t) == 0); // FIXME: alignment
+  assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0);   // FIXME: alignment
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  uint8_t block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    blake3_compress_in_place_rvv(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                 block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  memcpy(out, cv, BLAKE3_OUT_LEN);
+}
+
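+// blake3_hash_many_rvv below uses the standard RVV stripmining pattern: each pass asks
+// vsetvl how many of the remaining inputs fit in one batch (conceptually one 32-bit
+// lane per input), hands that batch to blake3_hash_vl_rvv, then advances the input
+// pointers, the counter (when increment_counter is set), and the output pointer by
+// the same vl before asking vsetvl again for the next batch.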
+void blake3_hash_many_rvv(const uint8_t *const *inputs, size_t num_inputs,
+                          size_t blocks, const uint32_t key[8],
+                          uint64_t counter, bool increment_counter,
+                          uint8_t flags, uint8_t flags_start, uint8_t flags_end,
+                          uint8_t *out) {
+  assert((uintptr_t)&inputs[0] % sizeof(uint32_t) == 0); // FIXME: alignment
+  assert((uintptr_t)&out[0] % sizeof(uint32_t) == 0);    // FIXME: alignment
+  for (
+      /* clang-format off */
+      size_t vl = __riscv_vsetvl_e32m1(num_inputs);
+      num_inputs > 0;
+      num_inputs -= vl,
+      inputs += vl,
+      counter += increment_counter * vl,
+      out = &out[vl * BLAKE3_OUT_LEN],
+      vl = __riscv_vsetvl_e32m1(num_inputs)
+      /* clang-format on */
+  ) {
+    blake3_hash_vl_rvv(inputs, blocks, key, counter, increment_counter, flags,
+                       flags_start, flags_end, out, vl);
+  }
+}
diff --git a/src/ffi_rvv.rs b/src/ffi_rvv.rs
new file mode 100644
index 000000000..ded1692aa
--- /dev/null
+++ b/src/ffi_rvv.rs
@@ -0,0 +1,113 @@
+use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
+
+// Unsafe because this may only be called on platforms supporting RVV.
+pub unsafe fn compress_in_place(
+    cv: &mut CVWords,
+    block: &[u8; BLOCK_LEN],
+    block_len: u8,
+    counter: u64,
+    flags: u8,
+) {
+    ffi::blake3_compress_in_place_rvv(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags)
+}
+
+// Unsafe because this may only be called on platforms supporting RVV.
+pub unsafe fn compress_xof(
+    cv: &CVWords,
+    block: &[u8; BLOCK_LEN],
+    block_len: u8,
+    counter: u64,
+    flags: u8,
+) -> [u8; 64] {
+    let mut out = [0u8; 64];
+    ffi::blake3_compress_xof_rvv(
+        cv.as_ptr(),
+        block.as_ptr(),
+        block_len,
+        counter,
+        flags,
+        out.as_mut_ptr(),
+    );
+    out
+}
+
+// Unsafe because this may only be called on platforms supporting RVV.
+pub unsafe fn hash_many<const N: usize>(
+    inputs: &[&[u8; N]],
+    key: &CVWords,
+    counter: u64,
+    increment_counter: IncrementCounter,
+    flags: u8,
+    flags_start: u8,
+    flags_end: u8,
+    out: &mut [u8],
+) {
+    // The Rust hash_many implementations do bounds checking on the `out`
+    // array, but the C implementations don't. Even though this is an unsafe
+    // function, assert the bounds here.
+    assert!(out.len() >= inputs.len() * OUT_LEN);
+    ffi::blake3_hash_many_rvv(
+        inputs.as_ptr() as *const *const u8,
+        inputs.len(),
+        N / BLOCK_LEN,
+        key.as_ptr(),
+        counter,
+        increment_counter.yes(),
+        flags,
+        flags_start,
+        flags_end,
+        out.as_mut_ptr(),
+    )
+}
+
+pub mod ffi {
+    extern "C" {
+        pub fn blake3_compress_in_place_rvv(
+            cv: *mut u32,
+            block: *const u8,
+            block_len: u8,
+            counter: u64,
+            flags: u8,
+        );
+        pub fn blake3_compress_xof_rvv(
+            cv: *const u32,
+            block: *const u8,
+            block_len: u8,
+            counter: u64,
+            flags: u8,
+            out: *mut u8,
+        );
+        pub fn blake3_hash_many_rvv(
+            inputs: *const *const u8,
+            num_inputs: usize,
+            blocks: usize,
+            key: *const u32,
+            counter: u64,
+            increment_counter: bool,
+            flags: u8,
+            flags_start: u8,
+            flags_end: u8,
+            out: *mut u8,
+        );
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    // This entire file is gated on feature="rvv", so RVV support is
+    // assumed here.
+
+    // TODO: test transpose?
+
+    #[test]
+    fn test_compress() {
+        crate::test::test_compress_fn(compress_in_place, compress_xof);
+    }
+
+    #[test]
+    fn test_hash_many() {
+        crate::test::test_hash_many_fn(hash_many, hash_many);
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 1fe47bf54..0810b35c1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -66,6 +66,10 @@
 //! enabling this feature will produce a binary that's not portable to CPUs
 //! without NEON support.
 //!
+//! The RVV implementation requires the `rvv` feature for RISC-V targets. Not
+//! all RISC-V CPUs support RVV, and enabling this feature will produce a binary
+//! that's not portable to CPUs without RVV support.
+//!
 //! The `traits-preview` feature enables implementations of traits from the
 //! RustCrypto [`digest`] crate, and re-exports that crate as `traits::digest`.
 //! However, the traits aren't stable, and they're expected to change in
@@ -114,6 +118,9 @@ mod avx512;
 #[path = "ffi_neon.rs"]
 mod neon;
 mod portable;
+#[cfg(blake3_rvv)]
+#[path = "ffi_rvv.rs"]
+mod rvv;
 #[cfg(blake3_sse2_rust)]
 #[path = "rust_sse2.rs"]
 mod sse2;
diff --git a/src/platform.rs b/src/platform.rs
index ef910aae5..0135a5798 100644
--- a/src/platform.rs
+++ b/src/platform.rs
@@ -51,6 +51,8 @@ pub enum Platform {
     AVX512,
     #[cfg(blake3_neon)]
     NEON,
+    #[cfg(blake3_rvv)]
+    RVV,
 }
 
 impl Platform {
@@ -80,6 +82,12 @@ impl Platform {
         {
             return Platform::NEON;
         }
+        // We don't use dynamic feature detection for RVV. If the "rvv"
+        // feature is on, RVV is assumed to be supported.
+        #[cfg(blake3_rvv)]
+        {
+            return Platform::RVV;
+        }
         Platform::Portable
     }
 
@@ -97,6 +105,8 @@ impl Platform {
             Platform::AVX512 => 16,
             #[cfg(blake3_neon)]
             Platform::NEON => 4,
+            #[cfg(blake3_rvv)]
+            Platform::RVV => todo!(),
         };
         debug_assert!(degree <= MAX_SIMD_DEGREE);
         degree
@@ -131,6 +141,10 @@ impl Platform {
             // No NEON compress_in_place() implementation yet.
             #[cfg(blake3_neon)]
             Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags),
+            #[cfg(blake3_rvv)]
+            Platform::RVV => unsafe {
+                crate::rvv::compress_in_place(cv, block, block_len, counter, flags)
+            },
         }
     }
 
@@ -163,6 +177,11 @@ impl Platform {
             // No NEON compress_xof() implementation yet.
             #[cfg(blake3_neon)]
             Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags),
+            // Assumed to be safe if the "rvv" feature is on.
+            #[cfg(blake3_rvv)]
+            Platform::RVV => unsafe {
+                crate::rvv::compress_xof(cv, block, block_len, counter, flags)
+            },
         }
     }
 
@@ -269,6 +288,20 @@ impl Platform {
                     out,
                 )
             },
+            // Assumed to be safe if the "rvv" feature is on.
+            #[cfg(blake3_rvv)]
+            Platform::RVV => unsafe {
+                crate::rvv::hash_many(
+                    inputs,
+                    key,
+                    counter,
+                    increment_counter,
+                    flags,
+                    flags_start,
+                    flags_end,
+                    out,
+                )
+            },
         }
     }
 
@@ -320,6 +353,12 @@ impl Platform {
         // Assumed to be safe if the "neon" feature is on.
         Some(Self::NEON)
     }
+
+    #[cfg(blake3_rvv)]
+    pub fn rvv() -> Option<Self> {
+        // Assumed to be safe if the "rvv" feature is on.
+        Some(Self::RVV)
+    }
 }
 
 // Note that AVX-512 is divided into multiple featuresets, and we use two of