Skip to content

Commit

Permalink
Add binary SFPU OPs - eltwise & bitwise (#47)
Browse files Browse the repository at this point in the history
* Add binary eltwise sfpu LLKs

* Add SFPU binary bitwise, int32 add, div
  • Loading branch information
rdjogoTT authored Dec 6, 2024
1 parent eadfdb4 commit ed02df9
Show file tree
Hide file tree
Showing 6 changed files with 197 additions and 5 deletions.
2 changes: 2 additions & 0 deletions common/inc/ckernel_sfpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

#include "sfpu/ckernel_sfpu_abs.h"
#include "sfpu/ckernel_sfpu_add_int32.h"
#include "sfpu/ckernel_sfpu_binary.h"
#include "sfpu/ckernel_sfpu_binary_bitwise.h"
#include "sfpu/ckernel_sfpu_cast_fp32_to_fp16a.h"
#include "sfpu/ckernel_sfpu_clamp.h"
#include "sfpu/ckernel_sfpu_comp.h"
Expand Down
2 changes: 1 addition & 1 deletion common/inc/sfpu/ckernel_sfpu_add_int32.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ namespace sfpu
{

template <bool APPROXIMATION_MODE, int ITERATIONS>
inline void _add_int32_(const int iterations, const uint dst_offset) {
inline void _add_int32_(const uint dst_offset) {
// Operand A is input1 (int32)
// Operand B is input2 (int32)
// Output is int32
Expand Down
139 changes: 139 additions & 0 deletions common/inc/sfpu/ckernel_sfpu_binary.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "ckernel.h"
#include "ckernel_defs.h"
#include "noc_nonblocking_api.h"
#include "ckernel_sfpu_exp.h"
#include "sfpi.h"

using namespace sfpi;

namespace ckernel
{
namespace sfpu
{

// Operation selectors passed as the BINOP_MODE template argument of
// _calculate_sfpu_binary_. The enum is intentionally left unnamed so the
// enumerators convert implicitly to the `int` template parameter.
enum {
    ADD_BINARY  = 0, // in0 + in1
    SUB_BINARY  = 1, // in0 - in1
    MUL_BINARY  = 2, // in0 * in1
    DIV_BINARY  = 3, // in0 / in1 (with explicit handling of a zero divisor)
    RSUB_BINARY = 4, // in1 - in0 (reversed subtraction)
    POW_BINARY  = 5, // in0 raised to the power in1
}; // BINOP_MODE

// Computes base^pow per vector lane as exp(pow * ln(base)).
//
// Outline of what the body does:
//   1. Detect whether `pow` is an integer (round-trip through an integer
//      conversion); if so, clear the sign of `base` so the log is taken of
//      its magnitude.
//   2. Approximate ln(base) as a cubic polynomial on the mantissa (exponent
//      forced to the bias, i.e. value in [1,2)) plus exponent * ln(2).
//   3. Evaluate _sfpu_exp_ on |pow * ln(base)| and take the reciprocal when
//      the product was negative.
//   4. For an originally negative base: negate the result when `pow` is an
//      odd integer, and return NaN when `pow` is not an integer.
//
// @param base  Base operand (one value per lane).
// @param pow   Exponent operand (one value per lane).
// @return      base^pow per lane, with the special cases listed above.
//
// NOTE(review): std::numeric_limits is used below, but <limits> is not among
// the includes visible at the top of this header; presumably it arrives
// transitively — confirm, or include it directly.
sfpi_inline vFloat _calculate_sfpu_binary_power_(vFloat base, vFloat pow)
{
// Keep the untouched base: its sign is needed for the final fix-up after
// `base` itself may have had its sign cleared.
vFloat original_base = base;

// Check for integer power
vInt pow_int = float_to_int16(pow, 0); // int16 should be plenty, since large powers will approach 0/Inf
vFloat pow_rounded = int32_to_float(pow_int, 0);
v_if (pow_rounded == pow) {
// if pow is integer, set base to positive
base = setsgn(base, 0);
}
v_endif;

// Normalize base to calculation range
vFloat x = setexp(base, 127); // set exp to exp bias (put base in range of 1-2)

// 3rd order polynomial approx - determined using rminimax over [1,2]
// (evaluated in Horner form; coefficients are hexadecimal float literals)
vFloat series_result = x * (x * (x * 0x2.44734p-4f - 0xd.e712ap-4f) + 0x2.4f5388p+0f) - 0x1.952992p+0f;

// Convert exponent to float
vInt exp = exexp(base);
v_if (exp < 0) {
// `~exp + 1` is the two's-complement negation (the magnitude of a negative
// exp); setsgn(..., 1) then sets the sign bit. Presumably int32_to_float
// expects a sign-magnitude integer — TODO confirm against the SFPI docs.
exp = setsgn(~exp + 1, 1);
}
v_endif;
vFloat expf = int32_to_float(exp, 0);

// De-normalize to original range
// NOTE(review): ln(2) is ~0.693147; the literal below differs in the 4th
// decimal place. Presumably it is tuned together with the polynomial or
// inherited from an existing log kernel — confirm before changing it.
vFloat vConstLn2 = 0.692871f;
vFloat log_result = expf * vConstLn2 + series_result; // exp correction: ln(1+x) + exp*ln(2)

// Base case when input is 0. ln(0) = -inf
v_if (base == 0.0f) { // Reload for register pressure
log_result = -std::numeric_limits<float>::infinity();
}
v_endif;

// Take exp(pow * log(base)) to produce base^pow
vFloat val = pow * log_result;

// Force sign to 0 (make number positive)
// _sfpu_exp_ is only ever called with a non-negative argument here; the
// negative case is handled by the reciprocal just below.
vFloat result = _sfpu_exp_(setsgn(val, 0));

// exp(-|val|) == 1 / exp(|val|)
v_if (val < 0) {
result = _sfpu_reciprocal_(result);
}
v_endif;

// Check valid base range
v_if (original_base < 0.0f) { // negative base
// Check for integer power
v_if (pow_rounded == pow) {
// if pow is odd integer, set result to negative
v_if (pow_int & 0x1) {
result = setsgn(result, 1);
}
v_endif;
} v_else {
// Negative base with a non-integer power: result is NaN.
result = std::numeric_limits<float>::quiet_NaN();
}
v_endif;
}
v_endif;

return result;
}

template <bool APPROXIMATION_MODE, int BINOP_MODE, int ITERATIONS = 8>
inline void _calculate_sfpu_binary_(const uint dst_offset)
{
// SFPU microcode
for (int d = 0; d < ITERATIONS; d++) {
constexpr uint dst_tile_size = 32;
vFloat in0 = dst_reg[0];
vFloat in1 = dst_reg[dst_offset * dst_tile_size];
vFloat result = 0.0f;

if constexpr (BINOP_MODE == ADD_BINARY) {
result = in0 + in1;
} else if constexpr (BINOP_MODE == SUB_BINARY) {
result = in0 - in1;
} else if constexpr (BINOP_MODE == MUL_BINARY) {
result = in0 * in1;
} else if constexpr (BINOP_MODE == DIV_BINARY) {
v_if (in1 == 0) {
v_if (in0 == 0) {
result = std::numeric_limits<float>::quiet_NaN();
} v_else {
result = std::numeric_limits<float>::infinity();
result = setsgn(result, in0);
}
v_endif;
} v_elseif (in0 == in1) {
result = vConst1;
} v_else {
result = in0 * setsgn(_sfpu_reciprocal_<4>(in1), in1);
}
v_endif;
} else if constexpr (BINOP_MODE == RSUB_BINARY) {
result = in1 - in0;
} else if constexpr (BINOP_MODE == POW_BINARY) {
result = _calculate_sfpu_binary_power_(in0, in1);
}

dst_reg[0] = result;
dst_reg++;
}
}

} // namespace sfpu
} // namespace ckernel
50 changes: 50 additions & 0 deletions common/inc/sfpu/ckernel_sfpu_binary_bitwise.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "ckernel.h"
#include "ckernel_defs.h"
#include "noc_nonblocking_api.h"
#include "sfpi.h"
#include <limits.h>

using namespace sfpi;

namespace ckernel
{
namespace sfpu
{

// Operation selectors passed as the BITWISE_MODE template argument of
// _calculate_sfpu_binary_bitwise_. Left unnamed so the enumerators convert
// implicitly to the `int` template parameter.
enum {
    AND_BINARY = 0, // selects TTI_SFPAND
    OR_BINARY  = 1, // selects TTI_SFPOR
    XOR_BINARY = 2, // selects TTI_SFPXOR
}; // BITWISE_MODE

template <bool APPROXIMATION_MODE, int BITWISE_MODE, int ITERATIONS = 8>
inline void _calculate_sfpu_binary_bitwise_(const uint dst_offset)
{
// SFPU microcode
for (int d = 0; d < ITERATIONS; d++) {
constexpr uint dst_tile_size = 64;

TTI_SFPLOAD(0,4,3,0);
TT_SFPLOAD(1,4,3,dst_offset*dst_tile_size);

if constexpr (BITWISE_MODE == AND_BINARY) {
TTI_SFPAND(0,1,0,0);
} else if constexpr (BITWISE_MODE == OR_BINARY) {
TTI_SFPOR(0,1,0,0);
} else if constexpr (BITWISE_MODE == XOR_BINARY) {
TTI_SFPXOR(0,1,0,0);
}

TTI_SFPSTORE(0,4,3,0);
dst_reg++;
}
}

} // namespace sfpu
} // namespace ckernel
8 changes: 4 additions & 4 deletions common/inc/sfpu/ckernel_sfpu_quant.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ namespace sfpu
{

template <bool APPROXIMATION_MODE, int ITERATIONS>
inline void _quant_int32_(const int iterations, const uint dst_offset)
inline void _quant_int32_(const uint dst_offset)
{
// Operand A is input (fp32)
// Operand B is scaling factor (fp32)
Expand All @@ -44,7 +44,7 @@ inline void _quant_int32_(const int iterations, const uint dst_offset)
}

template <bool APPROXIMATION_MODE, int ITERATIONS>
inline void _requant_int32_(const int iterations, const uint dst_offset)
inline void _requant_int32_(const uint dst_offset)
{
// Operand A is input to requant (int32)
// Operand B is scaling factor (fp32)
Expand Down Expand Up @@ -72,7 +72,7 @@ inline void _requant_int32_(const int iterations, const uint dst_offset)
}

template <bool APPROXIMATION_MODE, int ITERATIONS>
inline void _dequant_int32_(const int iterations, const uint dst_offset)
inline void _dequant_int32_(const uint dst_offset)
{
// Operand A[LREG0] is input to dequant (int32)
// Operand B[LREG1] is scaling factor (fp32)
Expand All @@ -98,7 +98,7 @@ inline void _dequant_int32_(const int iterations, const uint dst_offset)
}
}

inline void init_quant_zero_point(const uint zero_point)
inline void _init_quant_zero_point_(const uint zero_point)
{
_sfpu_load_imm32_(2,zero_point);
}
Expand Down
1 change: 1 addition & 0 deletions llk_lib/llk_math_eltwise_binary_sfpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ inline void _llk_math_eltwise_binary_sfpu_inc_dst_face_addr_() {
math::inc_dst_addr<8>();
}

template <SfpuType sfpu_op>
inline void _llk_math_eltwise_binary_sfpu_init_() {
sfpu::_init_sfpu_config_reg();
eltwise_binary_sfpu_configure_addrmod();
Expand Down

0 comments on commit ed02df9

Please sign in to comment.