Add binary SFPU OPs - eltwise & bitwise (#47)

* Add binary eltwise SFPU LLKs
* Add SFPU binary bitwise, int32 add, div
tenstorrent · Dec 6, 2024 · ed02df9
1 parent eadfdb4
commit ed02df9
Show file tree

Hide file tree

Showing 6 changed files with 197 additions and 5 deletions.
diff --git a/common/inc/ckernel_sfpu.h b/common/inc/ckernel_sfpu.h
@@ -14,6 +14,8 @@
 
 #include "sfpu/ckernel_sfpu_abs.h"
 #include "sfpu/ckernel_sfpu_add_int32.h"
+#include "sfpu/ckernel_sfpu_binary.h"
+#include "sfpu/ckernel_sfpu_binary_bitwise.h"
 #include "sfpu/ckernel_sfpu_cast_fp32_to_fp16a.h"
 #include "sfpu/ckernel_sfpu_clamp.h"
 #include "sfpu/ckernel_sfpu_comp.h"

diff --git a/common/inc/sfpu/ckernel_sfpu_add_int32.h b/common/inc/sfpu/ckernel_sfpu_add_int32.h
@@ -18,7 +18,7 @@ namespace sfpu
 {
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void _add_int32_(const int iterations, const uint dst_offset) {
+inline void _add_int32_(const uint dst_offset) {
     // Operand A is input1 (int32)
     // Operand B is input2 (int32)
     // Output is int32

diff --git a/common/inc/sfpu/ckernel_sfpu_binary.h b/common/inc/sfpu/ckernel_sfpu_binary.h
@@ -0,0 +1,139 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "noc_nonblocking_api.h"
+#include "ckernel_sfpu_exp.h"
+#include "sfpi.h"
+
+using namespace sfpi;
+
+namespace ckernel
+{
+namespace sfpu
+{
+
+enum {
+    ADD_BINARY = 0,   // result = in0 + in1
+    SUB_BINARY = 1,   // result = in0 - in1
+    MUL_BINARY = 2,   // result = in0 * in1
+    DIV_BINARY = 3,   // result = in0 / in1, with explicit handling of in1 == 0 and in0 == in1
+    RSUB_BINARY = 4,  // reversed subtract: result = in1 - in0
+    POW_BINARY = 5    // result = in0 raised to the power in1 (via _calculate_sfpu_binary_power_)
+};  // BINOP_MODE: values for the BINOP_MODE template argument of _calculate_sfpu_binary_
+
+sfpi_inline vFloat _calculate_sfpu_binary_power_(vFloat base, vFloat pow)
+{   // Returns base^pow per lane, computed as exp(pow * ln(base)); negative bases yield a signed result for integer pow and NaN otherwise
+    vFloat original_base = base; // keep the signed input; base may be made positive below
+
+    // Check for integer power: round-trip pow through int16 and compare with the original value
+    vInt pow_int = float_to_int16(pow, 0); // int16 should be plenty, since large powers will approach 0/Inf
+    vFloat pow_rounded = int32_to_float(pow_int, 0);
+    v_if (pow_rounded == pow) {
+        // pow is integer-valued: make base positive so the log below is taken of |base|
+        base = setsgn(base, 0);
+    }
+    v_endif;
+
+    // Normalize base to calculation range
+    vFloat x = setexp(base, 127);    // set exp to exp bias (put base in range of 1-2)
+
+    // 3rd order polynomial approx - determined using rminimax over [1,2]
+    vFloat series_result = x * (x * (x * 0x2.44734p-4f - 0xd.e712ap-4f) + 0x2.4f5388p+0f) - 0x1.952992p+0f;
+
+    // Convert exponent to float
+    vInt exp = exexp(base);
+    v_if (exp < 0) {
+        exp = setsgn(~exp + 1, 1); // negate (two's complement) and set the sign bit; presumably the sign-magnitude form int32_to_float expects - TODO confirm
+    }
+    v_endif;
+    vFloat expf = int32_to_float(exp, 0);
+
+    // De-normalize to original range
+    vFloat vConstLn2 = 0.692871f; // NOTE(review): ln(2) is ~0.693147; confirm this slightly lower constant is intentional
+    vFloat log_result = expf * vConstLn2 + series_result; // exp correction: ln(1+x) + exp*ln(2)
+
+    // Base case when input is 0. ln(0) = -inf
+    v_if (base == 0.0f) { // NOTE(review): this remark used to read "Reload for register pressure", but no reload is visible here - confirm
+        log_result = -std::numeric_limits<float>::infinity(); // NOTE(review): std::numeric_limits needs <limits>, which this header does not include directly
+    }
+    v_endif;
+
+    // Take exp(pow * log(base)) to produce base^pow
+    vFloat val = pow * log_result;
+
+    // Evaluate exp on |val| (sign forced to 0); a negative val is handled by the reciprocal below
+    vFloat result = _sfpu_exp_(setsgn(val, 0));
+
+    v_if (val < 0) {
+        result = _sfpu_reciprocal_(result); // exp(-|val|) == 1 / exp(|val|)
+    }
+    v_endif;
+
+    // Check valid base range: fix up the result for lanes whose original base was negative
+    v_if (original_base < 0.0f) { // negative base
+        // Check for integer power
+        v_if (pow_rounded == pow) {
+            // if pow is odd integer, set result to negative
+            v_if (pow_int & 0x1) {
+                result = setsgn(result, 1);
+            }
+            v_endif;
+        } v_else {
+            result = std::numeric_limits<float>::quiet_NaN(); // negative base with a non-integer power: no real-valued result
+        }
+        v_endif;
+    }
+    v_endif;
+
+    return result;
+}
+
+template <bool APPROXIMATION_MODE, int BINOP_MODE, int ITERATIONS = 8> // BINOP_MODE picks the operation at compile time; APPROXIMATION_MODE is not referenced in the body
+inline void _calculate_sfpu_binary_(const uint dst_offset)
+{
+    // SFPU microcode: each iteration combines dst_reg[0] with the operand at dst_reg[dst_offset * dst_tile_size] and writes the result back to dst_reg[0]
+    for (int d = 0; d < ITERATIONS; d++) {
+        constexpr uint dst_tile_size = 32; // presumably the dst_reg index span of one tile - TODO confirm
+        vFloat in0 = dst_reg[0];
+        vFloat in1 = dst_reg[dst_offset * dst_tile_size];
+        vFloat result = 0.0f; // stays 0.0f if BINOP_MODE matches none of the branches below
+
+        if constexpr (BINOP_MODE == ADD_BINARY) {
+            result = in0 + in1;
+        } else if constexpr (BINOP_MODE == SUB_BINARY) {
+            result = in0 - in1;
+        } else if constexpr (BINOP_MODE == MUL_BINARY) {
+            result = in0 * in1;
+        } else if constexpr (BINOP_MODE == DIV_BINARY) {
+            v_if (in1 == 0) { // division by zero is resolved explicitly instead of going through the reciprocal
+                v_if (in0 == 0) {
+                    result = std::numeric_limits<float>::quiet_NaN(); // 0 / 0
+                } v_else {
+                    result = std::numeric_limits<float>::infinity();
+                    result = setsgn(result, in0); // presumably gives the infinity the sign of in0 - TODO confirm setsgn(vFloat, vFloat) semantics
+                }
+                v_endif;
+            } v_elseif (in0 == in1) {
+                result = vConst1; // equal operands: return the constant instead of the reciprocal product
+            } v_else {
+                result = in0 * setsgn(_sfpu_reciprocal_<4>(in1), in1); // sign of in1 is re-applied to the reciprocal; presumably _sfpu_reciprocal_ returns a magnitude - TODO confirm
+            }
+            v_endif;
+        } else if constexpr (BINOP_MODE == RSUB_BINARY) {
+            result = in1 - in0;
+        } else if constexpr (BINOP_MODE == POW_BINARY) {
+            result = _calculate_sfpu_binary_power_(in0, in1); // in0 is the base, in1 the exponent
+        }
+
+        dst_reg[0] = result; // result overwrites the first operand
+        dst_reg++; // step dst_reg so the next iteration's dst_reg[0] refers to the following rows
+    }
+}
+
+}  // namespace sfpu
+}  // namespace ckernel
diff --git a/common/inc/sfpu/ckernel_sfpu_binary_bitwise.h b/common/inc/sfpu/ckernel_sfpu_binary_bitwise.h
@@ -0,0 +1,50 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "ckernel.h"
+#include "ckernel_defs.h"
+#include "noc_nonblocking_api.h"
+#include "sfpi.h"
+#include <limits.h>
+
+using namespace sfpi;
+
+namespace ckernel
+{
+namespace sfpu
+{
+
+enum {
+    AND_BINARY = 0,  // selects TTI_SFPAND
+    OR_BINARY = 1,   // selects TTI_SFPOR
+    XOR_BINARY = 2,  // selects TTI_SFPXOR
+};  // BITWISE_MODE: values for the BITWISE_MODE template argument of _calculate_sfpu_binary_bitwise_
+
+template <bool APPROXIMATION_MODE, int BITWISE_MODE, int ITERATIONS = 8> // BITWISE_MODE picks AND/OR/XOR at compile time; APPROXIMATION_MODE is not referenced in the body
+inline void _calculate_sfpu_binary_bitwise_(const uint dst_offset)
+{
+    // SFPU microcode: each iteration loads two operands, applies the selected bitwise instruction and stores the result over the first operand
+    for (int d = 0; d < ITERATIONS; d++) {
+        constexpr uint dst_tile_size = 64; // NOTE(review): _calculate_sfpu_binary_ uses 32 with dst_reg[] indexing; presumably raw SFPLOAD addresses use a different unit - confirm
+
+        TTI_SFPLOAD(0,4,3,0); // first operand, at offset 0; presumably args are (lreg, format/mode, addr mode, dest address) - confirm against the SFPU ISA
+        TT_SFPLOAD(1,4,3,dst_offset*dst_tile_size); // second operand, at the runtime offset dst_offset * dst_tile_size
+
+        if constexpr (BITWISE_MODE == AND_BINARY) {
+            TTI_SFPAND(0,1,0,0); // presumably combines registers 0 and 1, leaving the result in register 0 (which is stored below) - confirm operand order
+        } else if constexpr (BITWISE_MODE == OR_BINARY) {
+            TTI_SFPOR(0,1,0,0);
+        } else if constexpr (BITWISE_MODE == XOR_BINARY) {
+            TTI_SFPXOR(0,1,0,0);
+        }
+
+        TTI_SFPSTORE(0,4,3,0); // write register 0 back at offset 0, i.e. over the first operand
+        dst_reg++; // step dst_reg so the next iteration addresses the following rows
+    }
+}
+
+}  // namespace sfpu
+}  // namespace ckernel
diff --git a/common/inc/sfpu/ckernel_sfpu_quant.h b/common/inc/sfpu/ckernel_sfpu_quant.h
@@ -19,7 +19,7 @@ namespace sfpu
 {
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void _quant_int32_(const int iterations, const uint dst_offset)
+inline void _quant_int32_(const uint dst_offset)
 {
     // Operand A is input (fp32)
     // Operand B is scaling factor (fp32)
@@ -44,7 +44,7 @@ inline void _quant_int32_(const int iterations, const uint dst_offset)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void _requant_int32_(const int iterations, const uint dst_offset)
+inline void _requant_int32_(const uint dst_offset)
 {
     // Operand A is input to requant (int32)
     // Operand B is scaling factor (fp32)
@@ -72,7 +72,7 @@ inline void _requant_int32_(const int iterations, const uint dst_offset)
 }
 
 template <bool APPROXIMATION_MODE, int ITERATIONS>
-inline void _dequant_int32_(const int iterations, const uint dst_offset)
+inline void _dequant_int32_(const uint dst_offset)
 {
     // Operand A[LREG0] is input to dequant (int32)
     // Operand B[LREG1] is scaling factor (fp32)
@@ -98,7 +98,7 @@ inline void _dequant_int32_(const int iterations, const uint dst_offset)
     }
 }
 
-inline void init_quant_zero_point(const uint zero_point)
+inline void _init_quant_zero_point_(const uint zero_point) // forwards zero_point to _sfpu_load_imm32_ with first argument 2; presumably that selects LREG2 as the destination - TODO confirm
 {
     _sfpu_load_imm32_(2,zero_point);
 }

diff --git a/llk_lib/llk_math_eltwise_binary_sfpu.h b/llk_lib/llk_math_eltwise_binary_sfpu.h
@@ -53,6 +53,7 @@ inline void _llk_math_eltwise_binary_sfpu_inc_dst_face_addr_() {
     math::inc_dst_addr<8>();
 }
 
+template <SfpuType sfpu_op>
 inline void _llk_math_eltwise_binary_sfpu_init_() {
     sfpu::_init_sfpu_config_reg();
     eltwise_binary_sfpu_configure_addrmod();