-
Notifications
You must be signed in to change notification settings - Fork 98
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add support for parallelization along the width for untilize with unpadding (#17538)
### Ticket Link to GitHub Issue #17537 ### Problem description Currently, the untilize with unpadding implementation supports parallelization only along the height dimension. This hurts performance for wide tensors, as they are mapped to a limited number of cores. ### What's changed In this PR, we introduce support for parallelizing the untilize operation along the width dimension, similar to tilize with padding. The operation parallelizes over the dimension with the larger number of tiles. In future versions: - we want the operation to support parallelization along both dimensions simultaneously - we want the compute kernel to support processing an entire column block at once instead of one tile at a time For the tests added in test_to_layout.py, the kernel duration of the previous implementation is roughly 1.8 to 24.8 times longer than that of the current implementation ### Checklist - [x] Post commit CI passes https://github.com/tenstorrent/tt-metal/actions/runs/13121055787 - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests pass - [ ] New/Existing tests provide coverage for changes
- Loading branch information
Showing
6 changed files
with
424 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
26 changes: 26 additions & 0 deletions
26
ttnn/cpp/ttnn/operations/data_movement/untilize/device/kernels/compute/untilize_w.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#include "compute_kernel_api/untilize.h" | ||
#include "debug/dprint.h" | ||
|
||
namespace NAMESPACE { | ||
void MAIN { | ||
uint32_t per_core_block_cnt = get_compile_time_arg_val(0); | ||
uint32_t per_core_block_tile_cnt = get_compile_time_arg_val(1); | ||
uint32_t third_dim = get_compile_time_arg_val(2); | ||
untilize_init(tt::CBIndex::c_0, tt::CBIndex::c_16); | ||
|
||
uint32_t onetile = 1; | ||
for (uint32_t b = 0; b < per_core_block_cnt * per_core_block_tile_cnt * third_dim; ++b) { | ||
cb_wait_front(tt::CBIndex::c_0, onetile); | ||
cb_reserve_back(tt::CBIndex::c_16, onetile); | ||
|
||
untilize_block(tt::CBIndex::c_0, onetile, tt::CBIndex::c_16); | ||
|
||
cb_push_back(tt::CBIndex::c_16, onetile); | ||
cb_pop_front(tt::CBIndex::c_0, onetile); | ||
} | ||
} | ||
} // namespace NAMESPACE |
82 changes: 82 additions & 0 deletions
82
...tilize_with_unpadding/device/kernels/dataflow/writer_unary_stick_layout_col_multicore.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#include <stdint.h> | ||
|
||
#include "dataflow_api.h" | ||
|
||
void kernel_main() { | ||
constexpr uint32_t cb_id_out0 = 16; | ||
|
||
const uint32_t total_num_rows = get_compile_time_arg_val(3); | ||
const uint32_t ncores = get_compile_time_arg_val(4); | ||
const uint32_t third_dim = get_compile_time_arg_val(5); | ||
const uint32_t tile_width = get_compile_time_arg_val(6); | ||
|
||
const uint32_t dst_addr = get_arg_val<uint32_t>(0); | ||
const uint32_t unpadded_X_size = get_arg_val<uint32_t>(1); | ||
const uint32_t core_number = get_arg_val<uint32_t>(2); | ||
|
||
constexpr bool dst0_is_dram = get_compile_time_arg_val(0) == 1; | ||
|
||
#define stick_size_is_pow2 get_compile_time_arg_val(1) == 1 | ||
#if (stick_size_is_pow2) | ||
constexpr uint32_t log_base_2_of_page_size = get_compile_time_arg_val(2); | ||
const InterleavedPow2AddrGen<dst0_is_dram> s = { | ||
.bank_base_address = dst_addr, .log_base_2_of_page_size = log_base_2_of_page_size}; | ||
#else | ||
const InterleavedAddrGen<dst0_is_dram> s = {.bank_base_address = dst_addr, .page_size = unpadded_X_size}; | ||
#endif | ||
|
||
auto write_block = [&](uint32_t num_rows, | ||
uint32_t mul, | ||
uint32_t size_per_row_per_block, | ||
uint32_t start_id, | ||
uint32_t width_size, | ||
uint32_t size_2d) { | ||
uint32_t onetile = 1; | ||
bool has_rows = (num_rows) > 0; | ||
|
||
cb_wait_front(cb_id_out0, onetile * has_rows); | ||
uint32_t l1_read_addr = get_write_ptr(cb_id_out0); | ||
|
||
for (uint32_t k = 0; k < num_rows; k++) { | ||
uint64_t dst_noc_addr = get_noc_addr(size_2d + k, s); | ||
|
||
uint32_t total_size = mul * size_per_row_per_block + start_id + width_size; | ||
uint32_t padded_size = total_size - unpadded_X_size; | ||
uint32_t write_size = width_size; | ||
|
||
if (mul == ncores - 1 && padded_size > 0) { | ||
write_size = width_size - padded_size; | ||
} | ||
|
||
noc_async_write(l1_read_addr, dst_noc_addr + start_id + mul * size_per_row_per_block, write_size); | ||
|
||
noc_async_write_barrier(); | ||
|
||
if (k > 0 && (k % tile_width == 0)) { | ||
cb_pop_front(cb_id_out0, onetile * has_rows); | ||
cb_wait_front(cb_id_out0, onetile * has_rows); | ||
} | ||
l1_read_addr += width_size; | ||
} | ||
|
||
cb_pop_front(cb_id_out0, onetile * has_rows); | ||
}; | ||
|
||
const uint32_t size_per_row_per_block = get_arg_val<uint32_t>(3); | ||
const uint32_t blocks_per_core = get_arg_val<uint32_t>(4); | ||
const uint32_t width_size = get_arg_val<uint32_t>(5); | ||
|
||
uint32_t size_2d = 0; | ||
for (uint32_t dim3 = 0; dim3 < third_dim; dim3++) { | ||
uint32_t start_id = 0; | ||
for (uint32_t b = 0; b < blocks_per_core; b++) { | ||
write_block(total_num_rows, core_number, size_per_row_per_block, start_id, width_size, size_2d); | ||
start_id += width_size; | ||
} | ||
size_2d += total_num_rows; | ||
} | ||
} |
Oops, something went wrong.