Add support for parallelization along the width for untilize with unpadding (#17538)

### Ticket
#17537

### Problem description
Currently, the untilize with unpadding implementation supports
parallelization only along the height dimension. This hurts performance for
wide tensors, since their few tile rows map them to only a small number of cores.
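
As a rough illustration (a minimal sketch assuming 32x32 tiles and a height-only split; not code from this change): a wide tensor such as `[1, 1, 32, 10912]` has a single row of tiles but 341 tile columns, so a height-only split keeps all the work on one core.

```python
# Minimal sketch, not the op's dispatch code: count tile rows vs. tile columns,
# assuming 32x32 tiles.
TILE = 32


def tile_grid(shape, tile=TILE):
    *_, h, w = shape
    return (h + tile - 1) // tile, (w + tile - 1) // tile  # (tile rows, tile columns)


rows, cols = tile_grid([1, 1, 32, 10912])
print(rows, cols)  # 1 341 -> a height-only split can use at most one core
```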

### What's changed
This PR introduces support for parallelizing the untilize-with-unpadding
operation along the width dimension, similar to tilize with padding. The
operation now parallelizes over whichever dimension has the larger number of
tiles.
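
A hedged sketch of that selection rule (illustrative only; the helper name and the assumption that tile rows are counted across the batch dimensions are mine, not taken from this change):

```python
import math

TILE = 32  # assumed 32x32 tiles


def pick_parallel_dim(shape, tile=TILE):
    """Illustrative only: parallelize along whichever dimension has more tiles."""
    *batch, h, w = shape
    total_tile_rows = math.prod(batch) * ((h + tile - 1) // tile)
    tile_cols = (w + tile - 1) // tile
    return "width" if tile_cols > total_tile_rows else "height"


print(pick_parallel_dim([1, 1, 32, 10912]))  # width  (1 tile row, 341 tile columns)
print(pick_parallel_dim([5, 5, 64, 96]))     # height (50 tile rows, 3 tile columns)
```
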
In future versions:
- we want the operation to support parallelization along both
dimensions simultaneously
- we want the compute kernel to process an entire column block at once
instead of one tile at a time

For the tests added in test_to_layout.py, the kernel duration of the
previous implementation is roughly 1.8 to 24.8 times that of the new
implementation.


### Checklist
- [x] Post commit CI passes
https://github.com/tenstorrent/tt-metal/actions/runs/13121055787
- [ ] Blackhole Post commit (if applicable)
- [ ] Model regression CI testing passes (if applicable)
- [ ] Device performance regression CI testing passes (if applicable)
- [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests pass
- [ ] New/Existing tests provide coverage for changes
nardoTT authored Feb 5, 2025
1 parent fb0d2fa commit 1b01e8b
Showing 6 changed files with 424 additions and 6 deletions.
@@ -21,22 +21,53 @@ def create_grid(x, y):


params = [
pytest.param([[5, 5, 32, 32]], untilize_with_unpadding_args)
for untilize_with_unpadding_args in generation_funcs.gen_untilize_with_unpadding_args([[5, 5, 32, 32]])
pytest.param(
[[5, 5, 32, 32]],
{
"dtype": [ttnn.bfloat16],
"layout": [ttnn.TILE_LAYOUT],
"input_mem_config": [ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM)],
"output_mem_config": ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.L1),
"output_tensor_end": [4, 4, 31, 28],
},
)
]

params += [
pytest.param([[5, 5, 64, 96]], untilize_with_unpadding_args)
for untilize_with_unpadding_args in generation_funcs.gen_untilize_with_unpadding_args([[5, 5, 64, 96]])
pytest.param(
[[5, 5, 64, 96]],
{
"dtype": [ttnn.bfloat16],
"layout": [ttnn.TILE_LAYOUT],
"input_mem_config": [ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM)],
"output_mem_config": ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM),
"output_tensor_end": [4, 4, 60, 90],
},
)
]

params += [
pytest.param(
[[1, 1, 128, 7328]],
[[5, 5, 64, 96]],
{
"dtype": [ttnn.bfloat16],
"layout": [ttnn.TILE_LAYOUT],
"input_mem_config": [ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM)],
"input_mem_config": [ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.L1)],
"output_mem_config": ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.DRAM),
"output_tensor_end": [4, 4, 60, 90],
},
)
]


params += [
pytest.param(
[[1, 1, 128, 7328]],
{
"dtype": [ttnn.bfloat16],
"layout": [ttnn.TILE_LAYOUT],
"input_mem_config": [ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.L1)],
"output_mem_config": ttnn.MemoryConfig(ttnn.TensorMemoryLayout.INTERLEAVED, ttnn.BufferType.L1),
"output_tensor_end": [0, 0, 119, 7299],
},
)
56 changes: 56 additions & 0 deletions tests/ttnn/unit_tests/test_to_layout.py
@@ -283,3 +283,59 @@ def test_to_layout_page_error(shape, device):
torch_output = torch_tensor
assert torch_output.shape == output_tensor.shape
assert_with_pcc(torch_output, output_tensor, 0.9999)


@pytest.mark.parametrize("shape", [[64, 7680]])
@pytest.mark.parametrize("output_layout", [ttnn.ROW_MAJOR_LAYOUT])
@pytest.mark.parametrize("input_layout", [ttnn.TILE_LAYOUT])
def test_untilize_w1(shape, input_layout, output_layout, device):
torch.manual_seed(0)
input_a = torch.randn(shape, dtype=torch.bfloat16)

input_tensor = ttnn.from_torch(input_a, device=device, layout=input_layout, dtype=ttnn.bfloat16)
output_tensor = ttnn.untilize_with_unpadding(input_tensor, [36, 7667])
output_tensor = ttnn.to_torch(output_tensor)

assert_with_pcc(input_a[:37, :7668], output_tensor)


@pytest.mark.parametrize("shape", [[2, 32, 6144]])
@pytest.mark.parametrize("output_layout", [ttnn.ROW_MAJOR_LAYOUT])
@pytest.mark.parametrize("input_layout", [ttnn.TILE_LAYOUT])
def test_untilize_w2(shape, input_layout, output_layout, device):
torch.manual_seed(0)
input_a = torch.randn(shape, dtype=torch.bfloat16)

input_tensor = ttnn.from_torch(input_a, device=device, layout=input_layout, dtype=ttnn.bfloat16)
output_tensor = ttnn.untilize_with_unpadding(input_tensor, [1, 30, 6140])
output_tensor = ttnn.to_torch(output_tensor)

assert_with_pcc(input_a[:, :31, :6141], output_tensor)


@pytest.mark.parametrize("shape", [[1, 1, 32, 1536]])
@pytest.mark.parametrize("output_layout", [ttnn.ROW_MAJOR_LAYOUT])
@pytest.mark.parametrize("input_layout", [ttnn.TILE_LAYOUT])
def test_untilize_w3(shape, input_layout, output_layout, device):
torch.manual_seed(0)
input_a = torch.randn(shape, dtype=torch.bfloat16)

input_tensor = ttnn.from_torch(input_a, device=device, layout=input_layout, dtype=ttnn.bfloat16)
output_tensor = ttnn.untilize_with_unpadding(input_tensor, [0, 0, 31, 1535])
output_tensor = ttnn.to_torch(output_tensor)

assert_with_pcc(input_a[:, :, :32, :1536], output_tensor)


@pytest.mark.parametrize("shape", [[1, 1, 32, 10912]])
@pytest.mark.parametrize("output_layout", [ttnn.ROW_MAJOR_LAYOUT])
@pytest.mark.parametrize("input_layout", [ttnn.TILE_LAYOUT])
def test_untilize_w4(shape, input_layout, output_layout, device):
torch.manual_seed(0)
input_a = torch.randn(shape, dtype=torch.bfloat16)

input_tensor = ttnn.from_torch(input_a, device=device, layout=input_layout, dtype=ttnn.bfloat16)
output_tensor = ttnn.untilize_with_unpadding(input_tensor, [0, 0, 0, 10911])
output_tensor = ttnn.to_torch(output_tensor)

assert_with_pcc(input_a[:, :, :1, :10912], output_tensor)
@@ -0,0 +1,26 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include "compute_kernel_api/untilize.h"
#include "debug/dprint.h"

namespace NAMESPACE {
void MAIN {
    uint32_t per_core_block_cnt = get_compile_time_arg_val(0);
    uint32_t per_core_block_tile_cnt = get_compile_time_arg_val(1);
    uint32_t third_dim = get_compile_time_arg_val(2);
    untilize_init(tt::CBIndex::c_0, tt::CBIndex::c_16);

    uint32_t onetile = 1;
    for (uint32_t b = 0; b < per_core_block_cnt * per_core_block_tile_cnt * third_dim; ++b) {
        cb_wait_front(tt::CBIndex::c_0, onetile);
        cb_reserve_back(tt::CBIndex::c_16, onetile);

        untilize_block(tt::CBIndex::c_0, onetile, tt::CBIndex::c_16);

        cb_push_back(tt::CBIndex::c_16, onetile);
        cb_pop_front(tt::CBIndex::c_0, onetile);
    }
}
} // namespace NAMESPACE
@@ -0,0 +1,82 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <stdint.h>

#include "dataflow_api.h"

void kernel_main() {
    // Output circular buffer filled by the compute kernel with untilized rows.
    constexpr uint32_t cb_id_out0 = 16;

    const uint32_t total_num_rows = get_compile_time_arg_val(3);
    const uint32_t ncores = get_compile_time_arg_val(4);
    const uint32_t third_dim = get_compile_time_arg_val(5);
    const uint32_t tile_width = get_compile_time_arg_val(6);

    const uint32_t dst_addr = get_arg_val<uint32_t>(0);
    const uint32_t unpadded_X_size = get_arg_val<uint32_t>(1);
    const uint32_t core_number = get_arg_val<uint32_t>(2);

    constexpr bool dst0_is_dram = get_compile_time_arg_val(0) == 1;

#define stick_size_is_pow2 get_compile_time_arg_val(1) == 1
#if (stick_size_is_pow2)
    constexpr uint32_t log_base_2_of_page_size = get_compile_time_arg_val(2);
    const InterleavedPow2AddrGen<dst0_is_dram> s = {
        .bank_base_address = dst_addr, .log_base_2_of_page_size = log_base_2_of_page_size};
#else
    const InterleavedAddrGen<dst0_is_dram> s = {.bank_base_address = dst_addr, .page_size = unpadded_X_size};
#endif

    // Writes one width block of every row for this core to the destination buffer.
    auto write_block = [&](uint32_t num_rows,
                           uint32_t mul,
                           uint32_t size_per_row_per_block,
                           uint32_t start_id,
                           uint32_t width_size,
                           uint32_t size_2d) {
        uint32_t onetile = 1;
        bool has_rows = (num_rows) > 0;

        cb_wait_front(cb_id_out0, onetile * has_rows);
        uint32_t l1_read_addr = get_write_ptr(cb_id_out0);

        for (uint32_t k = 0; k < num_rows; k++) {
            uint64_t dst_noc_addr = get_noc_addr(size_2d + k, s);

            uint32_t total_size = mul * size_per_row_per_block + start_id + width_size;
            uint32_t padded_size = total_size - unpadded_X_size;
            uint32_t write_size = width_size;

            // The last core trims the padded tail so only unpadded data is written.
            if (mul == ncores - 1 && padded_size > 0) {
                write_size = width_size - padded_size;
            }

            noc_async_write(l1_read_addr, dst_noc_addr + start_id + mul * size_per_row_per_block, write_size);

            noc_async_write_barrier();

            // Every tile_width rows, release the consumed tile and wait for the next one.
            if (k > 0 && (k % tile_width == 0)) {
                cb_pop_front(cb_id_out0, onetile * has_rows);
                cb_wait_front(cb_id_out0, onetile * has_rows);
            }
            l1_read_addr += width_size;
        }

        cb_pop_front(cb_id_out0, onetile * has_rows);
    };

    const uint32_t size_per_row_per_block = get_arg_val<uint32_t>(3);
    const uint32_t blocks_per_core = get_arg_val<uint32_t>(4);
    const uint32_t width_size = get_arg_val<uint32_t>(5);

    uint32_t size_2d = 0;
    for (uint32_t dim3 = 0; dim3 < third_dim; dim3++) {
        uint32_t start_id = 0;
        for (uint32_t b = 0; b < blocks_per_core; b++) {
            write_block(total_num_rows, core_number, size_per_row_per_block, start_id, width_size, size_2d);
            start_id += width_size;
        }
        size_2d += total_num_rows;
    }
}