#16147: Replace binary with binary_ng #17160

Status: Draft · wants to merge 1 commit into main
(first changed file; its name is not captured in this view)

@@ -740,6 +740,7 @@ def test_run_eltwise_unary_comp(

 @pytest.mark.parametrize("unary_kind", ["add_unary", "sub_unary", "mul_unary", "div_unary"])
 @pytest.mark.parametrize("scalar", [-2.0, 1.0, 2.0, 8.0])
+@skip_for_grayskull()
 def test_run_eltwise_binop_to_unary_ops(
     self,
     unary_kind,
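Note on the new decorator: skip_for_grayskull is the repo's pytest guard for tests that cannot run on the Grayskull architecture (test_mul.py below imports it from models.utility_functions); the binary_ng path evidently is not supported there, hence the new skips in this PR. A minimal sketch of what such a guard can look like, assuming a hypothetical is_grayskull() helper keyed off the ARCH_NAME environment variable (the real helper queries the attached device):

import os

import pytest


def is_grayskull():
    # Hypothetical stand-in; the real helper inspects the device architecture.
    return os.environ.get("ARCH_NAME", "").lower().startswith("grayskull")


def skip_for_grayskull(reason="not supported on Grayskull"):
    # Wraps pytest.mark.skipif so the guard reads as a plain decorator.
    def decorator(func):
        return pytest.mark.skipif(is_grayskull(), reason=reason)(func)

    return decorator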
38 changes: 12 additions & 26 deletions tests/ttnn/unit_tests/gtests/test_graph_add.cpp
@@ -138,7 +138,10 @@ INSTANTIATE_TEST_SUITE_P(
         .b_Shape = ttnn::Shape(tt::tt_metal::Array4D{1, 3, 32, 32}),
         .memory_config = ttnn::L1_MEMORY_CONFIG,
         .expected_calltrace =
-            {"ttnn::add", "ttnn::prim::binary", "BinaryDeviceOperation", "tt::tt_metal::create_device_tensor"},
+            {"ttnn::add",
+             "ttnn::prim::binary_ng",
+             "BinaryNgDeviceOperation",
+             "tt::tt_metal::create_device_tensor"},
         .expected_peak_L1_memory_usage = 30720,
         .expected_intermediate_tensors_count = 0,
         .expected_cb_peak_per_core = 3 * 4096,
@@ -154,34 +157,14 @@ INSTANTIATE_TEST_SUITE_P(
         .memory_config = ttnn::L1_MEMORY_CONFIG,
         .expected_calltrace =
             {"ttnn::add",
-             "ttnn::repeat",
-             "ttnn::to_layout",
-             "ttnn::untilize",
-             "ttnn::prim::old_infra_device_operation",
-             "Untilize",
-             "tt::tt_metal::create_device_tensor",
-             "ttnn::view",
-             "ttnn::experimental::view",
-             "Tensor::reshape",
-             "ttnn::prim::old_infra_device_operation",
-             "RepeatDeviceOperation",
-             "tt::tt_metal::create_device_tensor",
-             "ttnn::view",
-             "ttnn::experimental::view",
-             "Tensor::reshape",
-             "ttnn::to_layout",
-             "ttnn::tilize",
-             "ttnn::prim::old_infra_device_operation",
-             "Tilize",
-             "tt::tt_metal::create_device_tensor",
-             "ttnn::prim::binary",
-             "BinaryDeviceOperation",
+             "ttnn::prim::binary_ng",
+             "BinaryNgDeviceOperation",
              "tt::tt_metal::create_device_tensor"},
-        .expected_peak_L1_memory_usage = 92160,
+        .expected_peak_L1_memory_usage = 67584,
         .expected_intermediate_tensors_count = 0,
         .expected_cb_peak_per_core = 3 * 4096,
         .expected_l1_output_per_core = 2048,
-        .expected_l1_peak_per_core = 2 * 2048,
+        .expected_l1_peak_per_core = 2048,
         .expected_output_info = {graph::TensorInfo{
             .shape = ttnn::Shape(tt::tt_metal::Array4D{4, 3, 32, 32}),
             .size = 24576,
@@ -199,7 +182,10 @@ INSTANTIATE_TEST_SUITE_P(
                 {6 * 32, 32 * 32},
                 ShardOrientation::COL_MAJOR}},
         .expected_calltrace =
-            {"ttnn::add", "ttnn::prim::binary", "BinaryDeviceOperation", "tt::tt_metal::create_device_tensor"},
+            {"ttnn::add",
+             "ttnn::prim::binary_ng",
+             "BinaryNgDeviceOperation",
+             "tt::tt_metal::create_device_tensor"},
         .expected_peak_L1_memory_usage = 20054016,
         .expected_intermediate_tensors_count = 0,
         .expected_cb_peak_per_core = 0,
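Note on the broadcast hunk above: the old call trace shows that a broadcast add of shapes (4, 3, 32, 32) + (1, 3, 32, 32) first materialized the smaller operand via ttnn::repeat, with untilize/tilize round-trips, before reaching ttnn::prim::binary. binary_ng performs the broadcast inside the device operation itself, which is consistent with expected_peak_L1_memory_usage dropping from 92160 to 67584 and expected_l1_peak_per_core from 2 * 2048 to 2048 (no intermediate repeated tensor). A hedged sketch of the call this test traces, with shapes taken from the diff (the device setup shown is an assumption about the harness, not part of the PR):

import torch

import ttnn

device = ttnn.open_device(device_id=0)

a = ttnn.from_torch(
    torch.rand(4, 3, 32, 32, dtype=torch.bfloat16),
    layout=ttnn.TILE_LAYOUT,
    device=device,
    memory_config=ttnn.L1_MEMORY_CONFIG,
)
b = ttnn.from_torch(
    torch.rand(1, 3, 32, 32, dtype=torch.bfloat16),
    layout=ttnn.TILE_LAYOUT,
    device=device,
    memory_config=ttnn.L1_MEMORY_CONFIG,
)

# With binary_ng the dim-0 broadcast happens inside the device op, so no
# ttnn::repeat / untilize / tilize round-trip shows up in the call trace.
out = ttnn.add(a, b, memory_config=ttnn.L1_MEMORY_CONFIG)

ttnn.close_device(device)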
(next changed file; its name is not captured in this view)

@@ -414,27 +414,27 @@ INSTANTIATE_TEST_SUITE_P(
             ResourceUsageMap{
                 {BoardType::N300,
                  ttnn::graph::ResourceUsage{
-                     .cb_peak_size_per_core = 57344,
-                     .l1_buffers_peak_per_core = 26688,
+                     .cb_peak_size_per_core = 3 * (2 * 2 * 32 * 32),
+                     .l1_buffers_peak_per_core = 10240,
                      .l1_output_buffer_per_core = 10240}},
                 {BoardType::E150,
                  ttnn::graph::ResourceUsage{
-                     .cb_peak_size_per_core = 57344,
-                     .l1_buffers_peak_per_core = 14720,
+                     .cb_peak_size_per_core = 3 * (2 * 2 * 32 * 32),
+                     .l1_buffers_peak_per_core = 6144,
                      .l1_output_buffer_per_core = 6144}}}),
         std::make_tuple( // broadcast
             g_interleave_4_2_160_244_tiled,
             g_interleave_1_2_160_244_tiled,
             ResourceUsageMap{
                 {BoardType::N300,
                  ttnn::graph::ResourceUsage{
-                     .cb_peak_size_per_core = 57344,
-                     .l1_buffers_peak_per_core = 26688,
+                     .cb_peak_size_per_core = 3 * (2 * 2 * 32 * 32),
+                     .l1_buffers_peak_per_core = 10240,
                      .l1_output_buffer_per_core = 10240}},
                 {BoardType::E150,
                  ttnn::graph::ResourceUsage{
-                     .cb_peak_size_per_core = 57344,
-                     .l1_buffers_peak_per_core = 14720,
+                     .cb_peak_size_per_core = 3 * (2 * 2 * 32 * 32),
+                     .l1_buffers_peak_per_core = 6144,
                      .l1_output_buffer_per_core = 6144}}})),
     [](const testing::TestParamInfo<std::tuple<ttnn::TensorSpec, ttnn::TensorSpec, ResourceUsageMap>>& info) {
         std::stringstream ss;
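Note on the new cb_peak_size_per_core expression: the factored form replaces an opaque 57344 with self-documenting arithmetic. One plausible reading (an interpretation, not stated in the PR): three circular buffers, each holding two bfloat16 tiles of 32 x 32 elements at 2 bytes per element:

num_cbs = 3              # e.g. in0, in1, out (interpretation)
tiles_per_cb = 2         # e.g. double buffering (interpretation)
bytes_per_elem = 2       # bfloat16
elems_per_tile = 32 * 32

cb_peak = num_cbs * (tiles_per_cb * bytes_per_elem * elems_per_tile)
assert cb_peak == 3 * (2 * 2 * 32 * 32) == 12288   # down from 57344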
5 changes: 3 additions & 2 deletions tests/ttnn/unit_tests/operations/eltwise/test_mul.py
@@ -8,8 +8,8 @@

 import ttnn

+from models.utility_functions import skip_for_grayskull
 from tests.ttnn.utils_for_testing import assert_with_pcc
-from torch.nn import functional as F


 # fmt: off
@@ -103,7 +103,8 @@ def test_multiply_int32_with_scalar(device, input_a, scalar):
 @pytest.mark.parametrize("output_memory_config", [ttnn.DRAM_MEMORY_CONFIG])
 @pytest.mark.parametrize("scalar", [0.125])
 @pytest.mark.parametrize("batch_size", [6, 7, 8])
-def test_multiply_with_scalar_sharded(device, scalar, batch_size, output_memory_config):
+@skip_for_grayskull()
+def test_multiply_float32_with_scalar_sharded(device, scalar, batch_size, output_memory_config):
     torch.manual_seed(0)
     torch_input_tensor_a = torch.rand((batch_size, 16, 384, 384), dtype=torch.float32)
     torch_output_tensor = scalar * torch_input_tensor_a
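To exercise the renamed test on its own (standard pytest selection; the path comes from the diff header above):

pytest tests/ttnn/unit_tests/operations/eltwise/test_mul.py -k test_multiply_float32_with_scalar_sharded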