diff --git a/common/inc/ckernel_sfpu.h b/common/inc/ckernel_sfpu.h
index 1e0f9a9..3d4848f 100644
--- a/common/inc/ckernel_sfpu.h
+++ b/common/inc/ckernel_sfpu.h
@@ -14,6 +14,8 @@
 #include "sfpu/ckernel_sfpu_abs.h"
 #include "sfpu/ckernel_sfpu_add_int32.h"
+#include "sfpu/ckernel_sfpu_binary.h"
+#include "sfpu/ckernel_sfpu_binary_bitwise.h"
 #include "sfpu/ckernel_sfpu_cast_fp32_to_fp16a.h"
 #include "sfpu/ckernel_sfpu_clamp.h"
 #include "sfpu/ckernel_sfpu_comp.h"
diff --git a/common/inc/sfpu/ckernel_sfpu_add_int32.h b/common/inc/sfpu/ckernel_sfpu_add_int32.h
index 0264cdb..71547ad 100644
--- a/common/inc/sfpu/ckernel_sfpu_add_int32.h
+++ b/common/inc/sfpu/ckernel_sfpu_add_int32.h
@@ -18,7 +18,7 @@ namespace sfpu {
 template
-inline void _add_int32_(const int iterations, const uint dst_offset) {
+inline void _add_int32_(const uint dst_offset) {
     // Operand A is input1 (int32)
     // Operand B is input2 (int32)
     // Output is int32
diff --git a/common/inc/sfpu/ckernel_sfpu_binary.h b/common/inc/sfpu/ckernel_sfpu_binary.h
new file mode 100644
index 0000000..f0c7d7d
--- /dev/null
+++ b/common/inc/sfpu/ckernel_sfpu_binary.h
@@ -0,0 +1,174 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "noc_nonblocking_api.h"
+#include "ckernel_sfpu_exp.h"
+#include "sfpi.h"
+#include <limits>
+
+using namespace sfpi;
+
+namespace ckernel
+{
+namespace sfpu
+{
+
+// Operation selectors for _calculate_sfpu_binary_ (its BINOP_MODE argument).
+enum {
+    ADD_BINARY = 0,
+    SUB_BINARY = 1,
+    MUL_BINARY = 2,
+    DIV_BINARY = 3,
+    RSUB_BINARY = 4,
+    POW_BINARY = 5
+}; // BINOP_MODE
+
+// Computes base^pow per lane as exp(pow * ln(base)) and then fixes up the sign.
+//
+// ln is approximated by forcing the exponent field to the bias (so the value
+// lies in [1, 2)), evaluating a 3rd-order polynomial on it and adding
+// exponent * ln(2). The exponential is always taken of |pow * ln(base)| and
+// the result is inverted afterwards when that product was negative.
+//
+// Negative bases: an integer pow gives a negative result only when pow is odd;
+// a non-integer pow gives NaN.
+//
+// NOTE(review): for base == 0 and pow == 0 the product below is 0 * -inf;
+// confirm the outcome matches the pow(0, 0) convention callers expect.
+sfpi_inline vFloat _calculate_sfpu_binary_power_(vFloat base, vFloat pow)
+{
+    // Keep the signed input: `base` has its sign cleared below for integer pow.
+    vFloat original_base = base;
+
+    // Check for integer power
+    vInt pow_int = float_to_int16(pow, 0); // int16 should be plenty, since large powers will approach 0/Inf
+    vFloat pow_rounded = int32_to_float(pow_int, 0);
+    v_if (pow_rounded == pow) {
+        // if pow is integer, set base to positive
+        base = setsgn(base, 0);
+    }
+    v_endif;
+
+    // Normalize base to calculation range
+    vFloat x = setexp(base, 127); // set exp to exp bias (put base in range of 1-2)
+
+    // 3rd order polynomial approx - determined using rminimax over [1,2]
+    vFloat series_result = x * (x * (x * 0x2.44734p-4f - 0xd.e712ap-4f) + 0x2.4f5388p+0f) - 0x1.952992p+0f;
+
+    // Convert exponent to float
+    vInt exp = exexp(base);
+    v_if (exp < 0) {
+        // ~exp + 1 is -exp (the magnitude); setsgn(..., 1) then sets the sign bit.
+        // Presumably this is the sign-magnitude form int32_to_float expects - confirm.
+        exp = setsgn(~exp + 1, 1);
+    }
+    v_endif;
+    vFloat expf = int32_to_float(exp, 0);
+
+    // De-normalize to original range
+    // NOTE(review): ln(2) is ~0.693147; confirm 0.692871 is an intentionally tuned value.
+    vFloat vConstLn2 = 0.692871f;
+    vFloat log_result = expf * vConstLn2 + series_result; // exp correction: ln(1+x) + exp*ln(2)
+
+    // Base case when input is 0. ln(0) = -inf
+    v_if (base == 0.0f) { // Reload for register pressure
+        log_result = -std::numeric_limits<float>::infinity();
+    }
+    v_endif;
+
+    // Take exp(pow * log(base)) to produce base^pow
+    vFloat val = pow * log_result;
+
+    // Force sign to 0 (make number positive)
+    vFloat result = _sfpu_exp_(setsgn(val, 0));
+
+    // exp(-|val|) == 1 / exp(|val|)
+    v_if (val < 0) {
+        result = _sfpu_reciprocal_(result);
+    }
+    v_endif;
+
+    // Check valid base range
+    v_if (original_base < 0.0f) { // negative base
+        // Check for integer power
+        v_if (pow_rounded == pow) {
+            // if pow is odd integer, set result to negative
+            v_if (pow_int & 0x1) {
+                result = setsgn(result, 1);
+            }
+            v_endif;
+        } v_else {
+            // A non-integer power of a negative base has no real value.
+            result = std::numeric_limits<float>::quiet_NaN();
+        }
+        v_endif;
+    }
+    v_endif;
+
+    return result;
+}
+
+// Applies the binary operation selected at compile time by BINOP_MODE to two
+// operands read through dst_reg.
+//
+// Each of the ITERATIONS steps reads operand A from dst_reg[0] and operand B
+// from dst_reg[dst_offset * 32], writes the result to dst_reg[0] and then
+// increments dst_reg.
+//
+// DIV_BINARY special cases: 0 / 0 is NaN, x / 0 is infinity with its sign set
+// from x, and equal operands with a non-zero divisor give exactly 1.
+//
+// NOTE(review): the template parameter list was lost in the collapsed patch
+// text. BINOP_MODE and ITERATIONS are required by the body; the leading
+// APPROXIMATION_MODE and the default of 8 are assumptions - confirm against
+// the call sites.
+template <bool APPROXIMATION_MODE, int BINOP_MODE, int ITERATIONS = 8>
+inline void _calculate_sfpu_binary_(const uint dst_offset)
+{
+    // SFPU microcode
+    for (int d = 0; d < ITERATIONS; d++) {
+        constexpr uint dst_tile_size = 32;
+        vFloat in0 = dst_reg[0];
+        vFloat in1 = dst_reg[dst_offset * dst_tile_size];
+        vFloat result = 0.0f;
+
+        if constexpr (BINOP_MODE == ADD_BINARY) {
+            result = in0 + in1;
+        } else if constexpr (BINOP_MODE == SUB_BINARY) {
+            result = in0 - in1;
+        } else if constexpr (BINOP_MODE == MUL_BINARY) {
+            result = in0 * in1;
+        } else if constexpr (BINOP_MODE == DIV_BINARY) {
+            v_if (in1 == 0) {
+                v_if (in0 == 0) {
+                    result = std::numeric_limits<float>::quiet_NaN();
+                } v_else {
+                    result = std::numeric_limits<float>::infinity();
+                    result = setsgn(result, in0);
+                }
+                v_endif;
+            } v_elseif (in0 == in1) {
+                result = vConst1;
+            } v_else {
+                // The reciprocal is given the sign of in1 before multiplying.
+                result = in0 * setsgn(_sfpu_reciprocal_<4>(in1), in1);
+            }
+            v_endif;
+        } else if constexpr (BINOP_MODE == RSUB_BINARY) {
+            result = in1 - in0;
+        } else if constexpr (BINOP_MODE == POW_BINARY) {
+            result = _calculate_sfpu_binary_power_(in0, in1);
+        }
+
+        dst_reg[0] = result;
+        dst_reg++;
+    }
+}
+
+} // namespace sfpu
+} // namespace ckernel
diff --git a/common/inc/sfpu/ckernel_sfpu_binary_bitwise.h b/common/inc/sfpu/ckernel_sfpu_binary_bitwise.h
new file mode 100644
index 0000000..c0df40c
--- /dev/null
+++ b/common/inc/sfpu/ckernel_sfpu_binary_bitwise.h
@@ -0,0 +1,68 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "noc_nonblocking_api.h"
+#include "sfpi.h"
+// NOTE(review): the header name on the next line was lost in the collapsed
+// patch text; <limits> is assumed - confirm (nothing below uses it).
+#include <limits>
+
+using namespace sfpi;
+
+namespace ckernel
+{
+namespace sfpu
+{
+
+// Operation selectors for _calculate_sfpu_binary_bitwise_ (its BITWISE_MODE argument).
+enum {
+    AND_BINARY = 0,
+    OR_BINARY = 1,
+    XOR_BINARY = 2,
+}; // BITWISE_MODE
+
+// Applies the bitwise operation selected at compile time by BITWISE_MODE.
+//
+// Each of the ITERATIONS steps issues two SFPLOADs (addresses 0 and
+// dst_offset * 64), one SFPAND / SFPOR / SFPXOR, and one SFPSTORE to
+// address 0, then increments dst_reg.
+//
+// NOTE(review): the sfpi-based _calculate_sfpu_binary_ uses a tile stride of
+// 32 with dst_reg[] indexing while this raw SFPLOAD address uses 64;
+// presumably the two are in different units - confirm.
+//
+// NOTE(review): the template parameter list was lost in the collapsed patch
+// text. BITWISE_MODE and ITERATIONS are required by the body; the leading
+// APPROXIMATION_MODE and the default of 8 are assumptions - confirm against
+// the call sites.
+template <bool APPROXIMATION_MODE, int BITWISE_MODE, int ITERATIONS = 8>
+inline void _calculate_sfpu_binary_bitwise_(const uint dst_offset)
+{
+    // SFPU microcode
+    for (int d = 0; d < ITERATIONS; d++) {
+        constexpr uint dst_tile_size = 64;
+
+        TTI_SFPLOAD(0,4,3,0);
+        // TT_ rather than TTI_ here, presumably because the address depends on the runtime dst_offset.
+        TT_SFPLOAD(1,4,3,dst_offset*dst_tile_size);
+
+        if constexpr (BITWISE_MODE == AND_BINARY) {
+            TTI_SFPAND(0,1,0,0);
+        } else if constexpr (BITWISE_MODE == OR_BINARY) {
+            TTI_SFPOR(0,1,0,0);
+        } else if constexpr (BITWISE_MODE == XOR_BINARY) {
+            TTI_SFPXOR(0,1,0,0);
+        }
+
+        TTI_SFPSTORE(0,4,3,0);
+        dst_reg++;
+    }
+}
+
+} // namespace sfpu
+} // namespace ckernel
diff --git a/common/inc/sfpu/ckernel_sfpu_quant.h b/common/inc/sfpu/ckernel_sfpu_quant.h
index 89a321b..0ade271 100644
--- a/common/inc/sfpu/ckernel_sfpu_quant.h
+++ b/common/inc/sfpu/ckernel_sfpu_quant.h
@@ -19,7 +19,7 @@ namespace sfpu {
 template
-inline void _quant_int32_(const int iterations, const uint dst_offset)
+inline void _quant_int32_(const uint dst_offset)
 {
     // Operand A is input (fp32)
     // Operand B is scaling factor (fp32)
@@ -44,7 +44,7 @@ inline void _quant_int32_(const int iterations, const uint dst_offset)
 }
 template
-inline void _requant_int32_(const int iterations, const uint dst_offset)
+inline void _requant_int32_(const uint dst_offset)
 {
     // Operand A is input to requant (int32)
     // Operand B is scaling factor (fp32)
@@ -72,7 +72,7 @@ inline void _requant_int32_(const int iterations, const uint dst_offset)
 }
 template
-inline void _dequant_int32_(const int iterations, const uint dst_offset)
+inline void _dequant_int32_(const uint dst_offset)
 {
     // Operand A[LREG0] is input to dequant (int32)
     // Operand B[LREG1] is scaling factor (fp32)
@@ -98,7 +98,7 @@ inline void _dequant_int32_(const int iterations, const uint dst_offset)
     }
 }
-inline void init_quant_zero_point(const uint zero_point)
+inline void _init_quant_zero_point_(const uint zero_point)
 {
     _sfpu_load_imm32_(2,zero_point);
 }
diff --git a/llk_lib/llk_math_eltwise_binary_sfpu.h b/llk_lib/llk_math_eltwise_binary_sfpu.h
index 7e34d68..39f1203 100644
--- a/llk_lib/llk_math_eltwise_binary_sfpu.h
+++ b/llk_lib/llk_math_eltwise_binary_sfpu.h
@@ -53,6 +53,7 @@ inline void _llk_math_eltwise_binary_sfpu_inc_dst_face_addr_() {
     math::inc_dst_addr<8>();
 }
+template
 inline void _llk_math_eltwise_binary_sfpu_init_() {
     sfpu::_init_sfpu_config_reg();
     eltwise_binary_sfpu_configure_addrmod();